* [PATCH 00/39] fixes for MDS cluster recovery
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

This series fixes issues I encountered when running random MDS restart tests.
With these patches, my 3-MDS setup running fsstress + thrash_exports can
survive restarting one or two MDSes dozens of times.

But there are still lots of unsolved problems: sometimes rstat corruption,
sometimes request hangs ...

This patch series is also available in:
  git://github.com/ukernel/ceph.git wip-mds

Regards
Yan, Zheng

* [PATCH 01/39] mds: preserve subtree bounds until slave commit
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

When replaying an operation that renames a directory inode into a non-auth
subtree, if the inode has subtree bounds, we should prevent them from being
trimmed until the slave commit.

This patch also fixes a bug in ESlaveUpdate::replay(). EMetaBlob::replay()
should be called before MDCache::finish_uncommitted_slave_update().

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 21 +++++++++++----------
 src/mds/Mutation.h |  5 ++---
 src/mds/journal.cc | 13 +++++++++----
 3 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index fddcfc6..684e70b 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -3016,10 +3016,10 @@ void MDCache::add_uncommitted_slave_update(metareqid_t reqid, int master, MDSlav
 {
   assert(uncommitted_slave_updates[master].count(reqid) == 0);
   uncommitted_slave_updates[master][reqid] = su;
-  if (su->rename_olddir)
-    uncommitted_slave_rename_olddir[su->rename_olddir]++;
+  for(set<CDir*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
+    uncommitted_slave_rename_olddir[*p]++;
   for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
-     uncommitted_slave_unlink[*p]++;
+    uncommitted_slave_unlink[*p]++;
 }
 
 void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, int master)
@@ -3031,11 +3031,12 @@ void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, int master)
   if (uncommitted_slave_updates[master].empty())
     uncommitted_slave_updates.erase(master);
   // discard the non-auth subtree we renamed out of
-  if (su->rename_olddir) {
-    uncommitted_slave_rename_olddir[su->rename_olddir]--;
-    if (uncommitted_slave_rename_olddir[su->rename_olddir] == 0) {
-      uncommitted_slave_rename_olddir.erase(su->rename_olddir);
-      CDir *root = get_subtree_root(su->rename_olddir);
+  for(set<CDir*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
+    CDir *dir = *p;
+    uncommitted_slave_rename_olddir[dir]--;
+    if (uncommitted_slave_rename_olddir[dir] == 0) {
+      uncommitted_slave_rename_olddir.erase(dir);
+      CDir *root = get_subtree_root(dir);
       if (root->get_dir_auth() == CDIR_AUTH_UNDEF)
 	try_trim_non_auth_subtree(root);
     }
@@ -6052,8 +6053,8 @@ bool MDCache::trim_non_auth_subtree(CDir *dir)
 {
   dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
 
-  // preserve the dir for rollback
-  if (uncommitted_slave_rename_olddir.count(dir))
+  if (uncommitted_slave_rename_olddir.count(dir) || // preserve the dir for rollback
+      my_ambiguous_imports.count(dir->dirfrag()))
     return true;
 
   bool keep_dir = false;
diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h
index 55b84eb..5013f04 100644
--- a/src/mds/Mutation.h
+++ b/src/mds/Mutation.h
@@ -315,13 +315,12 @@ struct MDSlaveUpdate {
   bufferlist rollback;
   elist<MDSlaveUpdate*>::item item;
   Context *waiter;
-  CDir* rename_olddir;
+  set<CDir*> olddirs;
   set<CInode*> unlinked;
   MDSlaveUpdate(int oo, bufferlist &rbl, elist<MDSlaveUpdate*> &list) :
     origop(oo),
     item(this),
-    waiter(0),
-    rename_olddir(0) {
+    waiter(0) {
     rollback.claim(rbl);
     list.push_back(&item);
   }
diff --git a/src/mds/journal.cc b/src/mds/journal.cc
index 5b3bd71..3375e40 100644
--- a/src/mds/journal.cc
+++ b/src/mds/journal.cc
@@ -1131,10 +1131,15 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
     if (olddir) {
       if (olddir->authority() != CDIR_AUTH_UNDEF &&
 	  renamed_diri->authority() == CDIR_AUTH_UNDEF) {
+	assert(slaveup); // auth to non-auth, must be slave prepare
 	list<frag_t> leaves;
 	renamed_diri->dirfragtree.get_leaves(leaves);
-	for (list<frag_t>::iterator p = leaves.begin(); p != leaves.end(); ++p)
-	  renamed_diri->get_or_open_dirfrag(mds->mdcache, *p);
+	for (list<frag_t>::iterator p = leaves.begin(); p != leaves.end(); ++p) {
+	  CDir *dir = renamed_diri->get_or_open_dirfrag(mds->mdcache, *p);
+	  // preserve subtree bound until slave commit
+	  if (dir->authority() == CDIR_AUTH_UNDEF)
+	    slaveup->olddirs.insert(dir);
+	}
       }
 
       mds->mdcache->adjust_subtree_after_rename(renamed_diri, olddir, false);
@@ -1143,7 +1148,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
       CDir *root = mds->mdcache->get_subtree_root(olddir);
       if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
 	if (slaveup) // preserve the old dir until slave commit
-	  slaveup->rename_olddir = olddir;
+	  slaveup->olddirs.insert(olddir);
 	else
 	  mds->mdcache->try_trim_non_auth_subtree(root);
       }
@@ -2122,10 +2127,10 @@ void ESlaveUpdate::replay(MDS *mds)
   case ESlaveUpdate::OP_ROLLBACK:
     dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master
 	     << ": applying rollback commit blob" << dendl;
+    commit.replay(mds, _segment);
     su = mds->mdcache->get_uncommitted_slave_update(reqid, master);
     if (su)
       mds->mdcache->finish_uncommitted_slave_update(reqid, master);
-    commit.replay(mds, _segment);
     break;
 
   default:
-- 
1.7.11.7


* [PATCH 02/39] mds: process finished contexts in batch
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

If there are several unstable locks in an inode, the current
Locker::eval(CInode*, ...) processes each lock's finished contexts separately.
This may cause a very deep call stack if the finished contexts also call
Locker::eval() on the same inode. An extreme example is:

Locker::eval() wakes an open request. Server::handle_client_open() starts
a log entry, then calls Locker::issue_new_caps(). Locker::issue_new_caps()
calls Locker::eval() and wakes another request. The latter request also
tries to start a log entry.
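
For illustration only, here is a minimal self-contained sketch of the batching
pattern this patch adopts (all names are hypothetical, not the actual
Locker/Context classes): waiters are collected while each lock is evaluated
and only run once, after all locks have been visited, so a callback that
re-enters evaluation does not nest inside the per-lock loop.

#include <functional>
#include <list>
#include <vector>

// Each lock keeps a queue of contexts waiting for it to become stable.
struct LockStub {
  bool stable = false;
  std::list<std::function<void()>> waiters;
};

// Evaluate one lock.  Instead of running its waiters right away (which could
// re-enter evaluation and deepen the call stack), move them into 'finishers'.
void eval_one(LockStub &lock, std::list<std::function<void()>> &finishers)
{
  if (!lock.stable) {
    lock.stable = true;
    finishers.splice(finishers.end(), lock.waiters);  // defer, don't run yet
  }
}

void eval_all(std::vector<LockStub> &locks)
{
  std::list<std::function<void()>> finishers;
  for (auto &lock : locks)
    eval_one(lock, finishers);       // gather every lock's finished contexts
  for (auto &fin : finishers)        // run them once, after all locks were
    fin();                           // visited, so the call stack stays shallow
}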

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/Locker.cc | 17 ++++++++++-------
 src/mds/Locker.h  |  4 ++--
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index b61fb14..d06a9cc 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -803,6 +803,7 @@ void Locker::eval_gather(SimpleLock *lock, bool first, bool *pneed_issue, list<C
 bool Locker::eval(CInode *in, int mask, bool caps_imported)
 {
   bool need_issue = false;
+  list<Context*> finishers;
   
   dout(10) << "eval " << mask << " " << *in << dendl;
 
@@ -821,19 +822,19 @@ bool Locker::eval(CInode *in, int mask, bool caps_imported)
 
  retry:
   if (mask & CEPH_LOCK_IFILE)
-    eval_any(&in->filelock, &need_issue, caps_imported);
+    eval_any(&in->filelock, &need_issue, &finishers, caps_imported);
   if (mask & CEPH_LOCK_IAUTH)
-    eval_any(&in->authlock, &need_issue, caps_imported);
+    eval_any(&in->authlock, &need_issue, &finishers, caps_imported);
   if (mask & CEPH_LOCK_ILINK)
-    eval_any(&in->linklock, &need_issue,caps_imported);
+    eval_any(&in->linklock, &need_issue, &finishers, caps_imported);
   if (mask & CEPH_LOCK_IXATTR)
-    eval_any(&in->xattrlock, &need_issue, caps_imported);
+    eval_any(&in->xattrlock, &need_issue, &finishers, caps_imported);
   if (mask & CEPH_LOCK_INEST)
-    eval_any(&in->nestlock, &need_issue, caps_imported);
+    eval_any(&in->nestlock, &need_issue, &finishers, caps_imported);
   if (mask & CEPH_LOCK_IFLOCK)
-    eval_any(&in->flocklock, &need_issue, caps_imported);
+    eval_any(&in->flocklock, &need_issue, &finishers, caps_imported);
   if (mask & CEPH_LOCK_IPOLICY)
-    eval_any(&in->policylock, &need_issue, caps_imported);
+    eval_any(&in->policylock, &need_issue, &finishers, caps_imported);
 
   // drop loner?
   if (in->is_auth() && in->is_head() && in->get_wanted_loner() != in->get_loner()) {
@@ -854,6 +855,8 @@ bool Locker::eval(CInode *in, int mask, bool caps_imported)
     }
   }
 
+  finish_contexts(g_ceph_context, finishers);
+
   if (need_issue && in->is_head())
     issue_caps(in);
 
diff --git a/src/mds/Locker.h b/src/mds/Locker.h
index f005925..3f79996 100644
--- a/src/mds/Locker.h
+++ b/src/mds/Locker.h
@@ -99,9 +99,9 @@ public:
 
   void eval_gather(SimpleLock *lock, bool first=false, bool *need_issue=0, list<Context*> *pfinishers=0);
   void eval(SimpleLock *lock, bool *need_issue);
-  void eval_any(SimpleLock *lock, bool *need_issue, bool first=false) {
+  void eval_any(SimpleLock *lock, bool *need_issue, list<Context*> *pfinishers=0, bool first=false) {
     if (!lock->is_stable())
-      eval_gather(lock, first, need_issue);
+      eval_gather(lock, first, need_issue, pfinishers);
     else if (lock->get_parent()->is_auth())
       eval(lock, need_issue);
   }
-- 
1.7.11.7


* [PATCH 03/39] mds: fix MDCache::adjust_bounded_subtree_auth()
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

There are cases that need both creating a new bound and swallowing an
intervening subtree. For example: an MDS exports subtree A with bound B and
imports subtree B with bound C at the same time. The MDS crashes, exporting
subtree A fails, but importing subtree B succeeds. During recovery, the MDS
may need to create the new bound C and swallow subtree B.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 684e70b..19dc60b 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -980,15 +980,21 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, pair<in
       }
       else {
 	dout(10) << "  want bound " << *bound << dendl;
+	CDir *t = get_subtree_root(bound->get_parent_dir());
+	if (subtrees[t].count(bound) == 0) {
+	  assert(t != dir);
+	  dout(10) << "  new bound " << *bound << dendl;
+	  adjust_subtree_auth(bound, t->authority());
+	}
 	// make sure it's nested beneath ambiguous subtree(s)
 	while (1) {
-	  CDir *t = get_subtree_root(bound->get_parent_dir());
-	  if (t == dir) break;
 	  while (subtrees[dir].count(t) == 0)
 	    t = get_subtree_root(t->get_parent_dir());
 	  dout(10) << "  swallowing intervening subtree at " << *t << dendl;
 	  adjust_subtree_auth(t, auth);
 	  try_subtree_merge_at(t);
+	  t = get_subtree_root(bound->get_parent_dir());
+	  if (t == dir) break;
 	}
       }
     }
-- 
1.7.11.7


* [PATCH 04/39] mds: make sure table request id unique
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

When an MDS becomes active, the table server re-sends 'agree' messages
for old prepared requests. If the recovered MDS starts a new table request
at the same time, the new request's ID can happen to be the same as an old
prepared request's ID, because the current table client assigns request IDs
from zero after the MDS restarts.
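
For illustration, a minimal sketch of the epoch-seeded ID scheme (hypothetical
names; the real change seeds MDSTableClient::last_reqid in init(), as the diff
below shows). Putting the current map epoch in the high bits guarantees that
IDs issued after a restart cannot collide with IDs issued before it.

#include <cassert>
#include <cstdint>

// Request IDs are epoch-prefixed, so a restarted client can never re-issue an
// ID that an earlier incarnation of the same client may have used.
struct ReqidGen {
  uint64_t last_reqid = 0;

  void init(uint32_t mdsmap_epoch) {
    // high 32 bits: map epoch at (re)start; low 32 bits: per-incarnation counter
    last_reqid = static_cast<uint64_t>(mdsmap_epoch) << 32;
  }
  uint64_t next() { return ++last_reqid; }
};

int main() {
  ReqidGen before, after;
  before.init(5);                    // incarnation started at epoch 5
  uint64_t old_id = before.next();   // 0x500000001
  after.init(7);                     // restarted; the epoch has advanced to 7
  uint64_t new_id = after.next();    // 0x700000001
  assert(old_id != new_id);          // cannot collide with any pre-restart ID
  return 0;
}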

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDS.cc            | 3 +++
 src/mds/MDSTableClient.cc | 5 +++++
 src/mds/MDSTableClient.h  | 2 ++
 3 files changed, 10 insertions(+)

diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index bb1c833..859782a 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -1212,6 +1212,9 @@ void MDS::boot_start(int step, int r)
 	dout(2) << "boot_start " << step << ": opening snap table" << dendl;	
 	snapserver->load(gather.new_sub());
       }
+
+      anchorclient->init();
+      snapclient->init();
       
       dout(2) << "boot_start " << step << ": opening mds log" << dendl;
       mdlog->open(gather.new_sub());
diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc
index ea021f5..beba0a3 100644
--- a/src/mds/MDSTableClient.cc
+++ b/src/mds/MDSTableClient.cc
@@ -34,6 +34,11 @@
 #undef dout_prefix
 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".tableclient(" << get_mdstable_name(table) << ") "
 
+void MDSTableClient::init()
+{
+  // make reqid unique between MDS restarts
+  last_reqid = (uint64_t)mds->mdsmap->get_epoch() << 32;
+}
 
 void MDSTableClient::handle_request(class MMDSTableRequest *m)
 {
diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
index e15837f..78035db 100644
--- a/src/mds/MDSTableClient.h
+++ b/src/mds/MDSTableClient.h
@@ -63,6 +63,8 @@ public:
   MDSTableClient(MDS *m, int tab) : mds(m), table(tab), last_reqid(0) {}
   virtual ~MDSTableClient() {}
 
+  void init();
+
   void handle_request(MMDSTableRequest *m);
 
   void _prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl, Context *onfinish);
-- 
1.7.11.7


* [PATCH 05/39] mds: send table request when peer is in proper state.
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

The table client/server should only send a request/reply when the peer is
active. The anchor query is an exception: an MDS in the rejoin stage may
need to fetch files before sending the rejoin ack, and the anchor server
can also still be in the rejoin stage.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/AnchorClient.cc   | 5 ++++-
 src/mds/MDSTableClient.cc | 9 ++++++---
 src/mds/MDSTableServer.cc | 3 ++-
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/mds/AnchorClient.cc b/src/mds/AnchorClient.cc
index 455e97f..d7da9d1 100644
--- a/src/mds/AnchorClient.cc
+++ b/src/mds/AnchorClient.cc
@@ -80,9 +80,12 @@ void AnchorClient::lookup(inodeno_t ino, vector<Anchor>& trace, Context *onfinis
 
 void AnchorClient::_lookup(inodeno_t ino)
 {
+  int ts = mds->mdsmap->get_tableserver();
+  if (mds->mdsmap->get_state(ts) < MDSMap::STATE_REJOIN)
+    return;
   MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_QUERY, 0, 0);
   ::encode(ino, req->bl);
-  mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+  mds->send_message_mds(req, ts);
 }
 
 
diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc
index beba0a3..df0131f 100644
--- a/src/mds/MDSTableClient.cc
+++ b/src/mds/MDSTableClient.cc
@@ -149,9 +149,10 @@ void MDSTableClient::_prepare(bufferlist& mutation, version_t *ptid, bufferlist
 void MDSTableClient::send_to_tableserver(MMDSTableRequest *req)
 {
   int ts = mds->mdsmap->get_tableserver();
-  if (mds->mdsmap->get_state(ts) >= MDSMap::STATE_CLIENTREPLAY)
+  if (mds->mdsmap->get_state(ts) >= MDSMap::STATE_CLIENTREPLAY) {
     mds->send_message_mds(req, ts);
-  else {
+  } else {
+    req->put();
     dout(10) << " deferring request to not-yet-active tableserver mds." << ts << dendl;
   }
 }
@@ -193,7 +194,9 @@ void MDSTableClient::got_journaled_ack(version_t tid)
 void MDSTableClient::finish_recovery()
 {
   dout(7) << "finish_recovery" << dendl;
-  resend_commits();
+  int ts = mds->mdsmap->get_tableserver();
+  if (mds->mdsmap->get_state(ts) >= MDSMap::STATE_CLIENTREPLAY)
+    resend_commits();
 }
 
 void MDSTableClient::resend_commits()
diff --git a/src/mds/MDSTableServer.cc b/src/mds/MDSTableServer.cc
index 4f86ff1..07c7d26 100644
--- a/src/mds/MDSTableServer.cc
+++ b/src/mds/MDSTableServer.cc
@@ -159,7 +159,8 @@ void MDSTableServer::handle_mds_recovery(int who)
   for (map<version_t,mds_table_pending_t>::iterator p = pending_for_mds.begin();
        p != pending_for_mds.end();
        ++p) {
-    if (who >= 0 && p->second.mds != who)
+    if ((who >= 0 && p->second.mds != who) ||
+	mds->mdsmap->get_state(p->second.mds) < MDSMap::STATE_CLIENTREPLAY)
       continue;
     MMDSTableRequest *reply = new MMDSTableRequest(table, TABLESERVER_OP_AGREE, p->second.reqid, p->second.tid);
     mds->send_message_mds(reply, p->second.mds);
-- 
1.7.11.7


* [PATCH 06/39] mds: make table client/server tolerate duplicated message
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

The anchor client re-sends queries when the anchor server becomes active,
so it's possible to get a duplicated query reply.

When the table server recovers, the clients re-send commits to the
server and the server re-sends 'agree' messages to the clients. When
the clients receive the 'agree' messages, they may send another
commit/rollback message to the server.
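
For illustration, a minimal sketch of the duplicate-tolerant rollback handling
(hypothetical names; the real logic lives in MDSTableServer::handle_rollback(),
as the diff below shows): the server only acts if the transaction is still
pending, and a rollback for an already finished tid is ignored.

#include <cassert>
#include <cstdint>
#include <map>

struct TableServerStub {
  uint64_t version = 0;                     // highest tid already applied
  std::map<uint64_t, int> pending_for_mds;  // tid -> mds that prepared it

  void handle_rollback(uint64_t tid) {
    if (pending_for_mds.count(tid)) {
      // ... roll back the prepared mutation and journal the rollback ...
      pending_for_mds.erase(tid);
    } else if (tid <= version) {
      // duplicate: this tid was already rolled back or committed, ignore it
    } else {
      assert(tid <= version);               // a tid we never handed out: a bug
    }
  }
};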

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/AnchorClient.cc   |  4 +++-
 src/mds/AnchorServer.cc   |  6 ++++--
 src/mds/MDSTableServer.cc | 22 ++++++++++++++++------
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/src/mds/AnchorClient.cc b/src/mds/AnchorClient.cc
index d7da9d1..bcc8710 100644
--- a/src/mds/AnchorClient.cc
+++ b/src/mds/AnchorClient.cc
@@ -41,7 +41,9 @@ void AnchorClient::handle_query_result(class MMDSTableRequest *m)
   ::decode(ino, p);
   ::decode(trace, p);
 
-  assert(pending_lookup.count(ino));
+  if (!pending_lookup.count(ino))
+    return;
+
   list<_pending_lookup> ls;
   ls.swap(pending_lookup[ino]);
   pending_lookup.erase(ino);
diff --git a/src/mds/AnchorServer.cc b/src/mds/AnchorServer.cc
index 6f37e53..594bf7b 100644
--- a/src/mds/AnchorServer.cc
+++ b/src/mds/AnchorServer.cc
@@ -213,10 +213,12 @@ bool AnchorServer::check_pending(version_t tid, MMDSTableRequest *req, list<Cont
       ++p;
     }
     assert(p != pending.end());
-    assert(p->second == NULL);
     // not the earliest pending operation, wait if it's a commit
     if (req) {
-      p->second = new C_MDS_RetryMessage(mds, req);
+      if (p->second == NULL)
+	p->second = new C_MDS_RetryMessage(mds, req);
+      else
+	req->put(); // duplicated commit
       return false;
     }
   }
diff --git a/src/mds/MDSTableServer.cc b/src/mds/MDSTableServer.cc
index 07c7d26..730606f 100644
--- a/src/mds/MDSTableServer.cc
+++ b/src/mds/MDSTableServer.cc
@@ -120,15 +120,25 @@ void MDSTableServer::_commit_logged(MMDSTableRequest *req)
 void MDSTableServer::handle_rollback(MMDSTableRequest *req)
 {
   dout(7) << "handle_rollback " << *req << dendl;
-  _rollback(req->get_tid());
-  _note_rollback(req->get_tid());
-  mds->mdlog->start_submit_entry(new ETableServer(table, TABLESERVER_OP_ROLLBACK, 0, -1, 
-						  req->get_tid(), version));
+
+  version_t tid = req->get_tid();
+  if (pending_for_mds.count(tid)) {
+    _rollback(tid);
+    _note_rollback(tid);
+    mds->mdlog->start_submit_entry(new ETableServer(table, TABLESERVER_OP_ROLLBACK, 0, -1,
+	  tid, version));
+  } else if (tid <= version) {
+    dout(0) << "got rollback for tid " << tid << " <= " << version
+	    << ", already rollbacked or committed." << dendl;
+  }
+  else {
+    // wtf.
+    dout(0) << "got rollbacked for tid " << tid << " > " << version << dendl;
+    assert(tid <= version);
+  }
   req->put();
 }
 
-
-
 // SERVER UPDATE
 
 void MDSTableServer::do_server_update(bufferlist& bl)
-- 
1.7.11.7


* [PATCH 07/39] mds: mark connection down when MDS fails
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Mark the connection down when an MDS fails, so that if the MDS restarts
and uses the same address, it does not get old messages.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDS.cc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index 859782a..282fa64 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -1046,8 +1046,10 @@ void MDS::handle_mds_map(MMDSMap *m)
     oldmap->get_failed_mds_set(oldfailed);
     mdsmap->get_failed_mds_set(failed);
     for (set<int>::iterator p = failed.begin(); p != failed.end(); ++p)
-      if (oldfailed.count(*p) == 0)
+      if (oldfailed.count(*p) == 0) {
+	messenger->mark_down(oldmap->get_inst(*p).addr);
 	mdcache->handle_mds_failure(*p);
+      }
     
     // or down then up?
     //  did their addr/inst change?
@@ -1055,8 +1057,10 @@ void MDS::handle_mds_map(MMDSMap *m)
     mdsmap->get_up_mds_set(up);
     for (set<int>::iterator p = up.begin(); p != up.end(); ++p) 
       if (oldmap->have_inst(*p) &&
-	  oldmap->get_inst(*p) != mdsmap->get_inst(*p))
+	  oldmap->get_inst(*p) != mdsmap->get_inst(*p)) {
+	messenger->mark_down(oldmap->get_inst(*p).addr);
 	mdcache->handle_mds_failure(*p);
+      }
   }
   if (is_clientreplay() || is_active() || is_stopping()) {
     // did anyone stop?
-- 
1.7.11.7


* [PATCH 08/39] mds: consider MDS as recovered when it reaches clientreplay state.
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

An MDS in the clientreplay state has already started serving requests. This
also makes MDS::handle_mds_recovery() and MDS::recovery_done() match.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDS.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index 282fa64..b91dcbd 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -1032,7 +1032,9 @@ void MDS::handle_mds_map(MMDSMap *m)
 
     set<int> oldactive, active;
     oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE);
+    oldmap->get_mds_set(oldactive, MDSMap::STATE_CLIENTREPLAY);
     mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE);
+    mdsmap->get_mds_set(active, MDSMap::STATE_CLIENTREPLAY);
     for (set<int>::iterator p = active.begin(); p != active.end(); ++p) 
       if (*p != whoami &&            // not me
 	  oldactive.count(*p) == 0)  // newly so?
-- 
1.7.11.7


* [PATCH 09/39] mds: defer eval gather locks when removing replica
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Locks' states should not change between composing the cache rejoin ack
message and sending it. If Locker::eval_gather() is called in
MDCache::{inode,dentry}_remove_replica(), it may wake requests and
change lock states.
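
For illustration, a minimal sketch of the deferred-eval pattern (hypothetical
names, not the actual MDCache/Locker API): locks that lose a replica are only
recorded while the ack is composed, and re-evaluated afterwards, so the ack
reflects a consistent snapshot of lock states.

#include <set>
#include <vector>

struct SimpleLockStub {
  std::set<int> replicas;
  bool remove_replica(int who) { return replicas.erase(who) > 0; }
};

// While the rejoin ack is being composed we only *record* which locks lost a
// replica; nothing is woken up, so lock states stay frozen.
void remove_replica_deferred(SimpleLockStub &lock, int from,
                             std::set<SimpleLockStub*> &gather_locks)
{
  if (lock.remove_replica(from))
    gather_locks.insert(&lock);
}

void handle_rejoin(std::vector<SimpleLockStub> &locks, int from)
{
  std::set<SimpleLockStub*> gather_locks;
  for (auto &lock : locks)
    remove_replica_deferred(lock, from, gather_locks);

  // ... compose and send the rejoin ack here, from a consistent snapshot ...

  for (auto *lock : gather_locks) {
    // only now is it safe to re-evaluate, possibly waking blocked requests
    (void)lock;  // eval_gather(lock);
  }
}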

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 51 ++++++++++++++++++++++++++++++---------------------
 src/mds/MDCache.h  |  8 +++++---
 2 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 19dc60b..0f6b842 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -3729,6 +3729,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
   // possible response(s)
   MMDSCacheRejoin *ack = 0;      // if survivor
   set<vinodeno_t> acked_inodes;  // if survivor
+  set<SimpleLock *> gather_locks;  // if survivor
   bool survivor = false;  // am i a survivor?
 
   if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
@@ -3851,7 +3852,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
       assert(dnl->is_primary());
       
       if (survivor && dn->is_replica(from)) 
-	dentry_remove_replica(dn, from);  // this induces a lock gather completion
+	dentry_remove_replica(dn, from, gather_locks);  // this induces a lock gather completion
       int dnonce = dn->add_replica(from);
       dout(10) << " have " << *dn << dendl;
       if (ack) 
@@ -3864,7 +3865,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
       assert(in);
 
       if (survivor && in->is_replica(from)) 
-	inode_remove_replica(in, from);
+	inode_remove_replica(in, from, gather_locks);
       int inonce = in->add_replica(from);
       dout(10) << " have " << *in << dendl;
 
@@ -3887,7 +3888,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
     CInode *in = get_inode(*p);
     assert(in);   // hmm fixme wrt stray?
     if (survivor && in->is_replica(from)) 
-      inode_remove_replica(in, from);    // this induces a lock gather completion
+      inode_remove_replica(in, from, gather_locks);    // this induces a lock gather completion
     int inonce = in->add_replica(from);
     dout(10) << " have base " << *in << dendl;
     
@@ -3909,8 +3910,11 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
       ack->add_inode_base(in);
     }
 
-    rejoin_scour_survivor_replicas(from, ack, acked_inodes);
+    rejoin_scour_survivor_replicas(from, ack, gather_locks, acked_inodes);
     mds->send_message(ack, weak->get_connection());
+
+    for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p)
+      mds->locker->eval_gather(*p);
   } else {
     // done?
     assert(rejoin_gather.count(from));
@@ -4055,7 +4059,9 @@ bool MDCache::parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
  * all validated replicas are acked with a strong nonce, etc.  if that isn't in the
  * ack, the replica dne, and we can remove it from our replica maps.
  */
-void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set<vinodeno_t>& acked_inodes)
+void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack,
+					     set<SimpleLock *>& gather_locks,
+					     set<vinodeno_t>& acked_inodes)
 {
   dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
 
@@ -4070,7 +4076,7 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set
     if (in->is_auth() &&
 	in->is_replica(from) &&
 	acked_inodes.count(p->second->vino()) == 0) {
-      inode_remove_replica(in, from);
+      inode_remove_replica(in, from, gather_locks);
       dout(10) << " rem " << *in << dendl;
     }
 
@@ -4099,7 +4105,7 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set
 	if (dn->is_replica(from) &&
 	    (ack->strong_dentries.count(dir->dirfrag()) == 0 ||
 	     ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->name, dn->last)) == 0)) {
-	  dentry_remove_replica(dn, from);
+	  dentry_remove_replica(dn, from, gather_locks);
 	  dout(10) << " rem " << *dn << dendl;
 	}
       }
@@ -6189,6 +6195,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
     return;
   }
 
+  set<SimpleLock *> gather_locks;
   // loop over realms
   for (map<dirfrag_t,MCacheExpire::realm>::iterator p = m->realms.begin();
        p != m->realms.end();
@@ -6255,7 +6262,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
 	// remove from our cached_by
 	dout(7) << " inode expire on " << *in << " from mds." << from 
 		<< " cached_by was " << in->get_replicas() << dendl;
-	inode_remove_replica(in, from);
+	inode_remove_replica(in, from, gather_locks);
       } 
       else {
 	// this is an old nonce, ignore expire.
@@ -6332,7 +6339,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
 	
 	if (nonce == dn->get_replica_nonce(from)) {
 	  dout(7) << "  dentry_expire on " << *dn << " from mds." << from << dendl;
-	  dentry_remove_replica(dn, from);
+	  dentry_remove_replica(dn, from, gather_locks);
 	} 
 	else {
 	  dout(7) << "  dentry_expire on " << *dn << " from mds." << from
@@ -6343,6 +6350,8 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
     }
   }
 
+  for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p)
+    mds->locker->eval_gather(*p);
 
   // done
   m->put();
@@ -6368,35 +6377,35 @@ void MDCache::discard_delayed_expire(CDir *dir)
   delayed_expire.erase(dir);  
 }
 
-void MDCache::inode_remove_replica(CInode *in, int from)
+void MDCache::inode_remove_replica(CInode *in, int from, set<SimpleLock *>& gather_locks)
 {
   in->remove_replica(from);
   in->mds_caps_wanted.erase(from);
   
   // note: this code calls _eval more often than it needs to!
   // fix lock
-  if (in->authlock.remove_replica(from)) mds->locker->eval_gather(&in->authlock);
-  if (in->linklock.remove_replica(from)) mds->locker->eval_gather(&in->linklock);
-  if (in->dirfragtreelock.remove_replica(from)) mds->locker->eval_gather(&in->dirfragtreelock);
-  if (in->filelock.remove_replica(from)) mds->locker->eval_gather(&in->filelock);
-  if (in->snaplock.remove_replica(from)) mds->locker->eval_gather(&in->snaplock);
-  if (in->xattrlock.remove_replica(from)) mds->locker->eval_gather(&in->xattrlock);
+  if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
+  if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
+  if (in->dirfragtreelock.remove_replica(from)) gather_locks.insert(&in->dirfragtreelock);
+  if (in->filelock.remove_replica(from)) gather_locks.insert(&in->filelock);
+  if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
+  if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
 
-  if (in->nestlock.remove_replica(from)) mds->locker->eval_gather(&in->nestlock);
-  if (in->flocklock.remove_replica(from)) mds->locker->eval_gather(&in->flocklock);
-  if (in->policylock.remove_replica(from)) mds->locker->eval_gather(&in->policylock);
+  if (in->nestlock.remove_replica(from)) gather_locks.insert(&in->nestlock);
+  if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
+  if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
 
   // trim?
   maybe_eval_stray(in);
 }
 
-void MDCache::dentry_remove_replica(CDentry *dn, int from)
+void MDCache::dentry_remove_replica(CDentry *dn, int from, set<SimpleLock *>& gather_locks)
 {
   dn->remove_replica(from);
 
   // fix lock
   if (dn->lock.remove_replica(from))
-    mds->locker->eval_gather(&dn->lock);
+    gather_locks.insert(&dn->lock);
 
   CDentry::linkage_t *dnl = dn->get_projected_linkage();
   if (dnl->is_primary())
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index f07ea74..a9f05c6 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -406,7 +406,9 @@ protected:
   CDir* rejoin_invent_dirfrag(dirfrag_t df);
   bool rejoin_fetch_dirfrags(MMDSCacheRejoin *m);
   void handle_cache_rejoin_strong(MMDSCacheRejoin *m);
-  void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set<vinodeno_t>& acked_inodes);
+  void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack,
+				      set<SimpleLock *>& gather_locks,
+				      set<vinodeno_t>& acked_inodes);
   void handle_cache_rejoin_ack(MMDSCacheRejoin *m);
   void handle_cache_rejoin_purge(MMDSCacheRejoin *m);
   void handle_cache_rejoin_missing(MMDSCacheRejoin *m);
@@ -607,8 +609,8 @@ public:
   }
 protected:
 
-  void inode_remove_replica(CInode *in, int rep);
-  void dentry_remove_replica(CDentry *dn, int rep);
+  void inode_remove_replica(CInode *in, int rep, set<SimpleLock *>& gather_locks);
+  void dentry_remove_replica(CDentry *dn, int rep, set<SimpleLock *>& gather_locks);
 
   void rename_file(CDentry *srcdn, CDentry *destdn);
 
-- 
1.7.11.7


* [PATCH 10/39] mds: unify slave request waiting
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

When requesting a remote xlock or remote wrlock, the master request is
put into the lock object's REMOTEXLOCK waiting queue. The problem is that
a remote wrlock's target can be different from the lock's auth MDS. When
the lock's auth MDS recovers, MDCache::handle_mds_recovery() may wake the
wrong request. So just unify slave request waiting: dispatch the master
request when the slave request reply is received.
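
For illustration, a minimal sketch of the unified waiting scheme (hypothetical
names): the master request records which slave MDS it is waiting for, and the
reply from that MDS clears the entry and re-dispatches the request, instead of
parking the request on a per-lock REMOTEXLOCK waiter queue.

#include <set>

struct MasterRequestStub {
  std::set<int> waiting_on_slave;   // slave mds ranks we expect a reply from
};

void send_remote_lock_request(MasterRequestStub &mdr, int target)
{
  // send_message_mds(new MMDSSlaveRequest(...), target);  // real send happens here
  mdr.waiting_on_slave.insert(target);
}

void handle_slave_reply(MasterRequestStub &mdr, int from)
{
  mdr.waiting_on_slave.erase(from);
  if (mdr.waiting_on_slave.empty()) {
    // dispatch_client_request(mdr);  // resume the master request
  }
}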

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/Locker.cc | 49 ++++++++++++++++++++++---------------------------
 src/mds/Server.cc | 12 ++++++++++--
 2 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index d06a9cc..0055a19 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -544,8 +544,6 @@ void Locker::cancel_locking(Mutation *mut, set<CInode*> *pneed_issue)
       if (need_issue)
 	pneed_issue->insert(static_cast<CInode *>(lock->get_parent()));
     }
-  } else {
-    lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK);
   }
   mut->finish_locking(lock);
 }
@@ -1326,18 +1324,16 @@ void Locker::remote_wrlock_start(SimpleLock *lock, int target, MDRequest *mut)
   }
 
   // send lock request
-  if (!lock->is_waiter_for(SimpleLock::WAIT_REMOTEXLOCK)) {
-    mut->start_locking(lock, target);
-    mut->more()->slaves.insert(target);
-    MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt,
-					       MMDSSlaveRequest::OP_WRLOCK);
-    r->set_lock_type(lock->get_type());
-    lock->get_parent()->set_object_info(r->get_object_info());
-    mds->send_message_mds(r, target);
-  }
-  
-  // wait
-  lock->add_waiter(SimpleLock::WAIT_REMOTEXLOCK, new C_MDS_RetryRequest(mdcache, mut));
+  mut->start_locking(lock, target);
+  mut->more()->slaves.insert(target);
+  MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt,
+					     MMDSSlaveRequest::OP_WRLOCK);
+  r->set_lock_type(lock->get_type());
+  lock->get_parent()->set_object_info(r->get_object_info());
+  mds->send_message_mds(r, target);
+
+  assert(mut->more()->waiting_on_slave.count(target) == 0);
+  mut->more()->waiting_on_slave.insert(target);
 }
 
 void Locker::remote_wrlock_finish(SimpleLock *lock, int target, Mutation *mut)
@@ -1411,19 +1407,18 @@ bool Locker::xlock_start(SimpleLock *lock, MDRequest *mut)
     }
     
     // send lock request
-    if (!lock->is_waiter_for(SimpleLock::WAIT_REMOTEXLOCK)) {
-      int auth = lock->get_parent()->authority().first;
-      mut->more()->slaves.insert(auth);
-      mut->start_locking(lock, auth);
-      MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt,
-						 MMDSSlaveRequest::OP_XLOCK);
-      r->set_lock_type(lock->get_type());
-      lock->get_parent()->set_object_info(r->get_object_info());
-      mds->send_message_mds(r, auth);
-    }
-    
-    // wait
-    lock->add_waiter(SimpleLock::WAIT_REMOTEXLOCK, new C_MDS_RetryRequest(mdcache, mut));
+    int auth = lock->get_parent()->authority().first;
+    mut->more()->slaves.insert(auth);
+    mut->start_locking(lock, auth);
+    MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt,
+					       MMDSSlaveRequest::OP_XLOCK);
+    r->set_lock_type(lock->get_type());
+    lock->get_parent()->set_object_info(r->get_object_info());
+    mds->send_message_mds(r, auth);
+
+    assert(mut->more()->waiting_on_slave.count(auth) == 0);
+    mut->more()->waiting_on_slave.insert(auth);
+
     return false;
   }
 }
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 6d0519f..4c4c86b 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -1371,7 +1371,11 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
       mdr->locks.insert(lock);
       mdr->finish_locking(lock);
       lock->get_xlock(mdr, mdr->get_client());
-      lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK);
+
+      assert(mdr->more()->waiting_on_slave.count(from));
+      mdr->more()->waiting_on_slave.erase(from);
+      assert(mdr->more()->waiting_on_slave.empty());
+      dispatch_client_request(mdr);
     }
     break;
     
@@ -1385,7 +1389,11 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
       mdr->remote_wrlocks[lock] = from;
       mdr->locks.insert(lock);
       mdr->finish_locking(lock);
-      lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK);
+
+      assert(mdr->more()->waiting_on_slave.count(from));
+      mdr->more()->waiting_on_slave.erase(from);
+      assert(mdr->more()->waiting_on_slave.empty());
+      dispatch_client_request(mdr);
     }
     break;
 
-- 
1.7.11.7


* [PATCH 11/39] mds: don't delay processing replica buffer in slave request
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Replicated objects need to be added into the cache immediately.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 12 ++++++++++++
 src/mds/MDCache.h  |  2 +-
 src/mds/MDS.cc     |  6 +++---
 src/mds/Server.cc  | 55 +++++++++++++++++++++++++++++++++++++++---------------
 4 files changed, 56 insertions(+), 19 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 0f6b842..b668842 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -7722,6 +7722,18 @@ void MDCache::_find_ino_dir(inodeno_t ino, Context *fin, bufferlist& bl, int r)
 
 /* ---------------------------- */
 
+int MDCache::get_num_client_requests()
+{
+  int count = 0;
+  for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
+      p != active_requests.end();
+      ++p) {
+    if (p->second->reqid.name.is_client() && !p->second->is_slave())
+      count++;
+  }
+  return count;
+}
+
 /* This function takes over the reference to the passed Message */
 MDRequest *MDCache::request_start(MClientRequest *req)
 {
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index a9f05c6..4634121 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -240,7 +240,7 @@ protected:
   hash_map<metareqid_t, MDRequest*> active_requests; 
 
 public:
-  int get_num_active_requests() { return active_requests.size(); }
+  int get_num_client_requests();
 
   MDRequest* request_start(MClientRequest *req);
   MDRequest* request_start_slave(metareqid_t rid, __u32 attempt, int by);
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index b91dcbd..e99eecc 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -1900,9 +1900,9 @@ bool MDS::_dispatch(Message *m)
       mdcache->is_open() &&
       replay_queue.empty() &&
       want_state == MDSMap::STATE_CLIENTREPLAY) {
-    dout(10) << " still have " << mdcache->get_num_active_requests()
-	     << " active replay requests" << dendl;
-    if (mdcache->get_num_active_requests() == 0)
+    int num_requests = mdcache->get_num_client_requests();
+    dout(10) << " still have " << num_requests << " active replay requests" << dendl;
+    if (num_requests == 0)
       clientreplay_done();
   }
 
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 4c4c86b..8e89e4c 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -107,10 +107,8 @@ void Server::dispatch(Message *m)
 		(m->get_type() == CEPH_MSG_CLIENT_REQUEST &&
 		 (static_cast<MClientRequest*>(m))->is_replay()))) {
       // replaying!
-    } else if (mds->is_clientreplay() && m->get_type() == MSG_MDS_SLAVE_REQUEST &&
-	       ((static_cast<MMDSSlaveRequest*>(m))->is_reply() ||
-		!mds->mdsmap->is_active(m->get_source().num()))) {
-      // slave reply or the master is also in the clientreplay stage
+    } else if (m->get_type() == MSG_MDS_SLAVE_REQUEST) {
+      // handle_slave_request() will wait if necessary
     } else {
       dout(3) << "not active yet, waiting" << dendl;
       mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
@@ -1291,6 +1289,13 @@ void Server::handle_slave_request(MMDSSlaveRequest *m)
   if (m->is_reply())
     return handle_slave_request_reply(m);
 
+  CDentry *straydn = NULL;
+  if (m->stray.length() > 0) {
+    straydn = mdcache->add_replica_stray(m->stray, from);
+    assert(straydn);
+    m->stray.clear();
+  }
+
   // am i a new slave?
   MDRequest *mdr = NULL;
   if (mdcache->have_request(m->get_reqid())) {
@@ -1326,9 +1331,26 @@ void Server::handle_slave_request(MMDSSlaveRequest *m)
       m->put();
       return;
     }
-    mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m->get_source().num());
+    mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), from);
   }
   assert(mdr->slave_request == 0);     // only one at a time, please!  
+
+  if (straydn) {
+    mdr->pin(straydn);
+    mdr->straydn = straydn;
+  }
+
+  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
+    dout(3) << "not clientreplay|active yet, waiting" << dendl;
+    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    return;
+  } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
+	     mdr->locks.empty()) {
+    dout(3) << "not active yet, waiting" << dendl;
+    mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
   mdr->slave_request = m;
   
   dispatch_slave_request(mdr);
@@ -1339,6 +1361,12 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
 {
   int from = m->get_source().num();
   
+  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
+    dout(3) << "not clientreplay|active yet, waiting" << dendl;
+    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
   if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
     metareqid_t r = m->get_reqid();
     mds->mdcache->committed_master_slave(r, from);
@@ -5138,10 +5166,8 @@ void Server::handle_slave_rmdir_prep(MDRequest *mdr)
   dout(10) << " dn " << *dn << dendl;
   mdr->pin(dn);
 
-  assert(mdr->slave_request->stray.length() > 0);
-  CDentry *straydn = mdcache->add_replica_stray(mdr->slave_request->stray, mdr->slave_to_mds);
-  assert(straydn);
-  mdr->pin(straydn);
+  assert(mdr->straydn);
+  CDentry *straydn = mdr->straydn;
   dout(10) << " straydn " << *straydn << dendl;
   
   mdr->now = mdr->slave_request->now;
@@ -5208,6 +5234,7 @@ void Server::_logged_slave_rmdir(MDRequest *mdr, CDentry *dn, CDentry *straydn)
   // done.
   mdr->slave_request->put();
   mdr->slave_request = 0;
+  mdr->straydn = 0;
 }
 
 void Server::handle_slave_rmdir_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
@@ -6460,15 +6487,12 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
   // stray?
   bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
 		    (srcdnl->is_primary() || destdnl->is_primary()));
-  CDentry *straydn = 0;
-  if (destdnl->is_primary() && !linkmerge) {
-    assert(mdr->slave_request->stray.length() > 0);
-    straydn = mdcache->add_replica_stray(mdr->slave_request->stray, mdr->slave_to_mds);
+  CDentry *straydn = mdr->straydn;
+  if (destdnl->is_primary() && !linkmerge)
     assert(straydn);
-    mdr->pin(straydn);
-  }
 
   mdr->now = mdr->slave_request->now;
+  mdr->more()->srcdn_auth_mds = srcdn->authority().first;
 
   // set up commit waiter (early, to clean up any freezing etc we do)
   if (!mdr->more()->slave_commit)
@@ -6651,6 +6675,7 @@ void Server::_logged_slave_rename(MDRequest *mdr,
   // done.
   mdr->slave_request->put();
   mdr->slave_request = 0;
+  mdr->straydn = 0;
 }
 
 void Server::_commit_slave_rename(MDRequest *mdr, int r,
-- 
1.7.11.7


* [PATCH 12/39] mds: compose and send resolve messages in batch
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Resolve messages for all MDS are the same, so we can compose and
send them in batch.
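
For illustration, a minimal sketch of the batching (hypothetical names, not
the real MDCache/MMDSResolve types): one resolve message is built per
recovering peer, all of them are populated in a single pass over the subtree
map, and then they are sent together.

#include <map>
#include <memory>
#include <set>
#include <vector>

struct ResolveMsgStub {
  std::vector<int> subtrees;        // stand-in for the real subtree lists
  void add_subtree(int df) { subtrees.push_back(df); }
};

void send_subtree_resolves(const std::set<int> &recovery_set, int whoami,
                           const std::vector<int> &my_subtrees)
{
  std::map<int, std::unique_ptr<ResolveMsgStub>> resolves;
  for (int who : recovery_set)
    if (who != whoami)
      resolves[who] = std::make_unique<ResolveMsgStub>();  // one per peer

  for (int df : my_subtrees)                  // walk the subtree map once
    for (auto &p : resolves)
      p.second->add_subtree(df);              // same entry goes to every peer

  for (auto &p : resolves) {
    // send_message_mds(p.second.release(), p.first);  // real send happens here
    p.second.reset();                                  // sketch: just free it
  }
}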

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 181 +++++++++++++++++++++++++----------------------------
 src/mds/MDCache.h  |  11 ++--
 2 files changed, 93 insertions(+), 99 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index b668842..c455a20 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -2432,10 +2432,6 @@ void MDCache::resolve_start()
     if (rootdir)
       adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
   }
-
-  for (map<int, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
-       p != uncommitted_slave_updates.end(); ++p)
-    need_resolve_ack.insert(p->first);
 }
 
 void MDCache::send_resolves()
@@ -2444,9 +2440,10 @@ void MDCache::send_resolves()
   got_resolve.clear();
   other_ambiguous_imports.clear();
 
-  if (!need_resolve_ack.empty()) {
-    for (set<int>::iterator p = need_resolve_ack.begin(); p != need_resolve_ack.end(); ++p)
-      send_slave_resolve(*p);
+  send_slave_resolves();
+  if (!resolve_ack_gather.empty()) {
+    dout(10) << "send_resolves still waiting for resolve ack from ("
+             << need_resolve_ack << ")" << dendl;
     return;
   }
   if (!need_resolve_rollback.empty()) {
@@ -2454,95 +2451,74 @@ void MDCache::send_resolves()
 	     << need_resolve_rollback << ")" << dendl;
     return;
   }
-  assert(uncommitted_slave_updates.empty());
-  for (set<int>::iterator p = recovery_set.begin(); p != recovery_set.end(); ++p) {
-    int who = *p;
-    if (who == mds->whoami)
-      continue;
-    if (migrator->is_importing() ||
-	migrator->is_exporting())
-      send_resolve_later(who);
-    else
-      send_resolve_now(who);
-  }
-}
-
-void MDCache::send_resolve_later(int who)
-{
-  dout(10) << "send_resolve_later to mds." << who << dendl;
-  wants_resolve.insert(who);
+  send_subtree_resolves();
 }
 
-void MDCache::maybe_send_pending_resolves()
+void MDCache::send_slave_resolves()
 {
-  if (wants_resolve.empty())
-    return;  // nothing to send.
-
-  // only if it's appropriate!
-  if (migrator->is_exporting() ||
-      migrator->is_importing()) {
-    dout(7) << "maybe_send_pending_resolves waiting, imports/exports still in progress" << dendl;
-    migrator->show_importing();
-    migrator->show_exporting();
-    return;  // not now
-  }
-  
-  // ok, send them.
-  for (set<int>::iterator p = wants_resolve.begin();
-       p != wants_resolve.end();
-       ++p) 
-    send_resolve_now(*p);
-  wants_resolve.clear();
-}
+  dout(10) << "send_slave_resolves" << dendl;
 
+  map<int, MMDSResolve*> resolves;
 
-class C_MDC_SendResolve : public Context {
-  MDCache *mdc;
-  int who;
-public:
-  C_MDC_SendResolve(MDCache *c, int w) : mdc(c), who(w) { }
-  void finish(int r) {
-    mdc->send_resolve_now(who);
-  }
-};
-
-void MDCache::send_slave_resolve(int who)
-{
-  dout(10) << "send_slave_resolve to mds." << who << dendl;
-  MMDSResolve *m = new MMDSResolve;
-
-  // list prepare requests lacking a commit
-  // [active survivor]
-  for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
-      p != active_requests.end();
-      ++p) {
-    if (p->second->is_slave() && p->second->slave_to_mds == who) {
-      dout(10) << " including uncommitted " << *p->second << dendl;
-      m->add_slave_request(p->first);
+  if (mds->is_resolve()) {
+    for (map<int, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
+	 p != uncommitted_slave_updates.end();
+	 ++p) {
+      resolves[p->first] = new MMDSResolve;
+      for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
+	   q != p->second.end();
+	   ++q) {
+	dout(10) << " including uncommitted " << q->first << dendl;
+	resolves[p->first]->add_slave_request(q->first);
+      }
     }
-  }
-  // [resolving]
-  if (uncommitted_slave_updates.count(who) &&
-      !uncommitted_slave_updates[who].empty()) {
-    for (map<metareqid_t, MDSlaveUpdate*>::iterator p = uncommitted_slave_updates[who].begin();
-	p != uncommitted_slave_updates[who].end();
-	++p) {
-      dout(10) << " including uncommitted " << p->first << dendl;
-      m->add_slave_request(p->first);
+  } else {
+    set<int> resolve_set;
+    mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
+    for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
+	 p != active_requests.end();
+	 ++p) {
+      if (!p->second->is_slave() || !p->second->slave_did_prepare())
+	continue;
+      int master = p->second->slave_to_mds;
+      if (resolve_set.count(master)) {
+	dout(10) << " including uncommitted " << *p->second << dendl;
+	if (!resolves.count(master))
+	  resolves[master] = new MMDSResolve;
+	resolves[master]->add_slave_request(p->first);
+      }
     }
   }
 
-  assert(!m->slave_requests.empty());
-  dout(10) << " will need resolve ack from mds." << who << dendl;
-  mds->send_message_mds(m, who);
+  for (map<int, MMDSResolve*>::iterator p = resolves.begin();
+       p != resolves.end();
+       ++p) {
+    dout(10) << "sending slave resolve to mds." << p->first << dendl;
+    mds->send_message_mds(p->second, p->first);
+    need_resolve_ack.insert(p->first);
+  }
 }
 
-void MDCache::send_resolve_now(int who)
+void MDCache::send_subtree_resolves()
 {
-  dout(10) << "send_resolve_now to mds." << who << dendl;
-  MMDSResolve *m = new MMDSResolve;
+  dout(10) << "send_subtree_resolves" << dendl;
 
-  show_subtrees();
+  if (migrator->is_exporting() || migrator->is_importing()) {
+    dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
+    migrator->show_importing();
+    migrator->show_exporting();
+    resolves_pending = true;
+    return;  // not now
+  }
+
+  map<int, MMDSResolve*> resolves;
+  for (set<int>::iterator p = recovery_set.begin();
+       p != recovery_set.end();
+       ++p) {
+    if (*p == mds->whoami)
+      continue;
+    resolves[*p] = new MMDSResolve;
+  }
 
   // known
   for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
@@ -2562,22 +2538,30 @@ void MDCache::send_resolve_now(int who)
       set<CDir*> bounds;
       get_subtree_bounds(dir, bounds);
       vector<dirfrag_t> dfls;
-      for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
-	dfls.push_back((*p)->dirfrag());
-      m->add_ambiguous_import(dir->dirfrag(), dfls);
+      for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
+	dfls.push_back((*q)->dirfrag());
+      for (map<int, MMDSResolve*>::iterator q = resolves.begin();
+	   q != resolves.end();
+	   ++q)
+	resolves[q->first]->add_ambiguous_import(dir->dirfrag(), dfls);
       dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
     } else {
       // not ambiguous.
-      m->add_subtree(dir->dirfrag());
-      
+      for (map<int, MMDSResolve*>::iterator q = resolves.begin();
+	   q != resolves.end();
+	   ++q)
+	resolves[q->first]->add_subtree(dir->dirfrag());
       // bounds too
       vector<dirfrag_t> dfls;
       for (set<CDir*>::iterator q = subtrees[dir].begin();
 	   q != subtrees[dir].end();
 	   ++q) {
 	CDir *bound = *q;
-	m->add_subtree_bound(dir->dirfrag(), bound->dirfrag());
 	dfls.push_back(bound->dirfrag());
+	for (map<int, MMDSResolve*>::iterator r = resolves.begin();
+	     r != resolves.end();
+	     ++r)
+	  resolves[r->first]->add_subtree_bound(dir->dirfrag(), bound->dirfrag());
       }
       dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
     }
@@ -2587,15 +2571,23 @@ void MDCache::send_resolve_now(int who)
   for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
        p != my_ambiguous_imports.end();
        ++p) {
-    m->add_ambiguous_import(p->first, p->second);
+    for (map<int, MMDSResolve*>::iterator q = resolves.begin();
+	 q != resolves.end();
+	 ++q)
+      resolves[q->first]->add_ambiguous_import(p->first, p->second);
     dout(10) << " ambig " << p->first << " " << p->second << dendl;
   }
 
   // send
-  mds->send_message_mds(m, who);
+  for (map<int, MMDSResolve*>::iterator p = resolves.begin();
+       p != resolves.end();
+       ++p) {
+    dout(10) << "sending subtee resolve to mds." << p->first << dendl;
+    mds->send_message_mds(p->second, p->first);
+  }
+  resolves_pending = false;
 }
 
-
 void MDCache::handle_mds_failure(int who)
 {
   dout(7) << "handle_mds_failure mds." << who << dendl;
@@ -2631,7 +2623,6 @@ void MDCache::handle_mds_failure(int who)
     // slave to the failed node?
     if (p->second->slave_to_mds == who) {
       if (p->second->slave_did_prepare()) {
-	need_resolve_ack.insert(who);
 	dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl;
       } else {
 	dout(10) << " slave request " << *p->second << " has no prepare, finishing up" << dendl;
@@ -3011,7 +3002,7 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
 
   need_resolve_ack.erase(from);
   if (need_resolve_ack.empty() && need_resolve_rollback.empty()) {
-    send_resolves();
+    send_subtree_resolves();
     process_delayed_resolve();
   }
 
@@ -3078,7 +3069,7 @@ void MDCache::finish_rollback(metareqid_t reqid) {
     finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
   need_resolve_rollback.erase(reqid);
   if (need_resolve_ack.empty() && need_resolve_rollback.empty()) {
-    send_resolves();
+    send_subtree_resolves();
     process_delayed_resolve();
   }
 }
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 4634121..10e3dd7 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -328,6 +328,7 @@ protected:
   friend class ESlaveUpdate;
   friend class ECommitted;
 
+  bool resolves_pending;
   set<int> wants_resolve;   // nodes i need to send my resolve to
   set<int> got_resolve;     // nodes i got resolves from
   set<int> need_resolve_ack;   // nodes i need a resolve_ack from
@@ -367,10 +368,12 @@ public:
   void finish_ambiguous_import(dirfrag_t dirino);
   void resolve_start();
   void send_resolves();
-  void send_slave_resolve(int who);
-  void send_resolve_now(int who);
-  void send_resolve_later(int who);
-  void maybe_send_pending_resolves();
+  void send_slave_resolves();
+  void send_subtree_resolves();
+  void maybe_send_pending_resolves() {
+    if (resolves_pending)
+      send_subtree_resolves();
+  }
   
   void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
 			       map<dirfrag_t,vector<dirfrag_t> >& subtrees);
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 13/39] mds: don't send resolve message between active MDS
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (11 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 12/39] mds: compose and send resolve messages in batch Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-20 21:56   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 14/39] mds: set resolve/rejoin gather MDS set in advance Yan, Zheng
                   ` (27 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

When the MDS cluster is resolving, the current behavior is to send the subtree
resolve message to all other MDSes and wait for all other MDSes' resolve
messages. The problem is that an active MDS can have a different subtree map
due to rename. Besides, gathering resolve messages from active MDSes is also
racy. The only function of these messages is to disambiguate other MDSes'
imports; we can replace them with an import finish notification.

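To illustrate the replacement mechanism, here is a minimal standalone C++
sketch (not the actual Ceph code; Notify, send_notify() and finish_import()
are made-up names): once an importer finishes an ambiguous import it tells the
bystander MDSes directly that authority is settled, instead of relying on
resolves exchanged between active MDSes.

  // Minimal sketch of notify-on-import-finish; Notify stands in for
  // MExportDirNotify, the bool mirrors import_finish(dir, bool notify).
  #include <iostream>
  #include <set>
  #include <utility>

  struct Notify {
    int dirfrag;
    std::pair<int,int> old_auth;   // (exporter, importer) while ambiguous
    std::pair<int,int> new_auth;   // (importer, unknown) once settled
  };

  void send_notify(int bystander, const Notify& n) {
    std::cout << "notify mds." << bystander << ": dirfrag " << n.dirfrag
              << " auth " << n.old_auth.first << "," << n.old_auth.second
              << " -> " << n.new_auth.first << "\n";
  }

  void finish_import(int dirfrag, int importer, int exporter,
                     const std::set<int>& bystanders, bool notify) {
    if (notify) {                         // exporter may be recovering
      Notify n{dirfrag, {exporter, importer}, {importer, -1}};
      for (int b : bystanders)
        send_notify(b, n);
    }
    std::cout << "import of dirfrag " << dirfrag << " finished on mds."
              << importer << "\n";
  }

  int main() {
    // importer mds.1 finishes an ambiguous import from failed mds.0,
    // with mds.2 and mds.3 as bystanders
    finish_import(100, 1, 0, {2, 3}, true);
  }
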
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc  | 12 +++++++++---
 src/mds/Migrator.cc | 25 +++++++++++++++++++++++--
 src/mds/Migrator.h  |  3 ++-
 3 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index c455a20..73c1d59 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -2517,7 +2517,8 @@ void MDCache::send_subtree_resolves()
        ++p) {
     if (*p == mds->whoami)
       continue;
-    resolves[*p] = new MMDSResolve;
+    if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
+      resolves[*p] = new MMDSResolve;
   }
 
   // known
@@ -2837,7 +2838,7 @@ void MDCache::handle_resolve(MMDSResolve *m)
 	  migrator->import_reverse(dir);
 	} else {
 	  dout(7) << "ambiguous import succeeded on " << *dir << dendl;
-	  migrator->import_finish(dir);
+	  migrator->import_finish(dir, true);
 	}
 	my_ambiguous_imports.erase(p);  // no longer ambiguous.
       }
@@ -3432,7 +3433,12 @@ void MDCache::rejoin_send_rejoins()
        ++p) {
     CDir *dir = p->first;
     assert(dir->is_subtree_root());
-    assert(!dir->is_ambiguous_dir_auth());
+    if (dir->is_ambiguous_dir_auth()) {
+      // exporter is recovering, importer is survivor.
+      assert(rejoins.count(dir->authority().first));
+      assert(!rejoins.count(dir->authority().second));
+      continue;
+    }
 
     // my subtree?
     if (dir->is_auth())
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index 5e53803..833df12 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -2088,6 +2088,23 @@ void Migrator::import_reverse(CDir *dir)
   }
 }
 
+void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
+{
+  dout(7) << "import_notify_finish " << *dir << dendl;
+
+  for (set<int>::iterator p = import_bystanders[dir].begin();
+       p != import_bystanders[dir].end();
+       ++p) {
+    MExportDirNotify *notify =
+      new MExportDirNotify(dir->dirfrag(), false,
+			   pair<int,int>(import_peer[dir->dirfrag()], mds->get_nodeid()),
+			   pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
+    for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); i++)
+      notify->get_bounds().push_back((*i)->dirfrag());
+    mds->send_message_mds(notify, *p);
+  }
+}
+
 void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
 {
   dout(7) << "import_notify_abort " << *dir << dendl;
@@ -2183,11 +2200,11 @@ void Migrator::handle_export_finish(MExportDirFinish *m)
   CDir *dir = cache->get_dirfrag(m->get_dirfrag());
   assert(dir);
   dout(7) << "handle_export_finish on " << *dir << dendl;
-  import_finish(dir);
+  import_finish(dir, false);
   m->put();
 }
 
-void Migrator::import_finish(CDir *dir) 
+void Migrator::import_finish(CDir *dir, bool notify)
 {
   dout(7) << "import_finish on " << *dir << dendl;
 
@@ -2205,6 +2222,10 @@ void Migrator::import_finish(CDir *dir)
   // remove pins
   set<CDir*> bounds;
   cache->get_subtree_bounds(dir, bounds);
+
+  if (notify)
+    import_notify_finish(dir, bounds);
+
   import_remove_pins(dir, bounds);
 
   map<CInode*, map<client_t,Capability::Export> > cap_imports;
diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
index 7988f32..2889a74 100644
--- a/src/mds/Migrator.h
+++ b/src/mds/Migrator.h
@@ -273,12 +273,13 @@ protected:
   void import_reverse_unfreeze(CDir *dir);
   void import_reverse_final(CDir *dir);
   void import_notify_abort(CDir *dir, set<CDir*>& bounds);
+  void import_notify_finish(CDir *dir, set<CDir*>& bounds);
   void import_logged_start(dirfrag_t df, CDir *dir, int from,
 			   map<client_t,entity_inst_t> &imported_client_map,
 			   map<client_t,uint64_t>& sseqmap);
   void handle_export_finish(MExportDirFinish *m);
 public:
-  void import_finish(CDir *dir);
+  void import_finish(CDir *dir, bool notify);
 protected:
 
   void handle_export_caps(MExportCaps *m);
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 14/39] mds: set resolve/rejoin gather MDS set in advance
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (12 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 13/39] mds: don't send resolve message between active MDS Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-20 22:09   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 15/39] mds: don't send MDentry{Link,Unlink} before receiving cache rejoin Yan, Zheng
                   ` (26 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

An active MDS may receive resolve/rejoin messages before it receives the
mdsmap message announcing that the MDS cluster is in the resolving or
rejoining state. So instead of setting the gather MDS sets when receiving
the mdsmap, set them in advance when detecting an MDS failure.

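A minimal sketch of the ordering this fixes, using simplified hypothetical
types rather than the real MDCache: the gather sets are filled when the
failure is detected, so a resolve that arrives before the new mdsmap still
finds its sender accounted for.

  #include <cassert>
  #include <iostream>
  #include <set>

  struct Cache {
    std::set<int> resolve_gather;   // nodes we still need resolves from
    std::set<int> rejoin_gather;    // nodes we still need rejoins from

    void handle_mds_failure(int who) {    // failure detected: record it now
      resolve_gather.insert(who);
      rejoin_gather.insert(who);
    }
    void handle_resolve(int from) {       // may arrive before the mdsmap update
      assert(resolve_gather.count(from)); // would not hold with lazy setup
      resolve_gather.erase(from);
      std::cout << "got resolve from mds." << from << "\n";
    }
  };

  int main() {
    Cache c;
    c.handle_mds_failure(2);   // mds.2 fails
    c.handle_resolve(2);       // its resolve arrives before the mdsmap; still OK
  }
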
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 41 +++++++++++++++++++----------------------
 src/mds/MDCache.h  |  5 ++---
 2 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 73c1d59..69db1dd 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -2432,18 +2432,17 @@ void MDCache::resolve_start()
     if (rootdir)
       adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
   }
+  resolve_gather = recovery_set;
+  resolve_gather.erase(mds->get_nodeid());
+  rejoin_gather = resolve_gather;
 }
 
 void MDCache::send_resolves()
 {
-  // reset resolve state
-  got_resolve.clear();
-  other_ambiguous_imports.clear();
-
   send_slave_resolves();
   if (!resolve_ack_gather.empty()) {
     dout(10) << "send_resolves still waiting for resolve ack from ("
-             << need_resolve_ack << ")" << dendl;
+	     << resolve_ack_gather << ")" << dendl;
     return;
   }
   if (!need_resolve_rollback.empty()) {
@@ -2495,7 +2494,7 @@ void MDCache::send_slave_resolves()
        ++p) {
     dout(10) << "sending slave resolve to mds." << p->first << dendl;
     mds->send_message_mds(p->second, p->first);
-    need_resolve_ack.insert(p->first);
+    resolve_ack_gather.insert(p->first);
   }
 }
 
@@ -2598,16 +2597,15 @@ void MDCache::handle_mds_failure(int who)
   recovery_set.erase(mds->get_nodeid());
   dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
 
-  // adjust my recovery lists
-  wants_resolve.erase(who);   // MDS will ask again
-  got_resolve.erase(who);     // i'll get another.
+  resolve_gather.insert(who);
   discard_delayed_resolve(who);
 
+  rejoin_gather.insert(who);
   rejoin_sent.erase(who);        // i need to send another
   rejoin_ack_gather.erase(who);  // i'll need/get another.
 
-  dout(10) << " wants_resolve " << wants_resolve << dendl;
-  dout(10) << " got_resolve " << got_resolve << dendl;
+  dout(10) << " resolve_gather " << resolve_gather << dendl;
+  dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
   dout(10) << " rejoin_sent " << rejoin_sent << dendl;
   dout(10) << " rejoin_gather " << rejoin_gather << dendl;
   dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
@@ -2788,7 +2786,7 @@ void MDCache::handle_resolve(MMDSResolve *m)
     return;
   }
 
-  if (!need_resolve_ack.empty() || !need_resolve_rollback.empty()) {
+  if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) {
     dout(10) << "delay processing subtree resolve" << dendl;
     discard_delayed_resolve(from);
     delayed_resolve[from] = m;
@@ -2875,7 +2873,7 @@ void MDCache::handle_resolve(MMDSResolve *m)
   }
   
   // did i get them all?
-  got_resolve.insert(from);
+  resolve_gather.erase(from);
   
   maybe_resolve_finish();
 
@@ -2901,12 +2899,12 @@ void MDCache::discard_delayed_resolve(int who)
 
 void MDCache::maybe_resolve_finish()
 {
-  assert(need_resolve_ack.empty());
+  assert(resolve_ack_gather.empty());
   assert(need_resolve_rollback.empty());
 
-  if (got_resolve != recovery_set) {
-    dout(10) << "maybe_resolve_finish still waiting for more resolves, got (" 
-	     << got_resolve << "), need (" << recovery_set << ")" << dendl;
+  if (!resolve_gather.empty()) {
+    dout(10) << "maybe_resolve_finish still waiting for resolves ("
+	     << resolve_gather << ")" << dendl;
     return;
   } else {
     dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
@@ -2926,7 +2924,7 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
   dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
   int from = ack->get_source().num();
 
-  if (!need_resolve_ack.count(from)) {
+  if (!resolve_ack_gather.count(from)) {
     ack->put();
     return;
   }
@@ -3001,8 +2999,8 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
       assert(p->second->slave_to_mds != from);
   }
 
-  need_resolve_ack.erase(from);
-  if (need_resolve_ack.empty() && need_resolve_rollback.empty()) {
+  resolve_ack_gather.erase(from);
+  if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
     send_subtree_resolves();
     process_delayed_resolve();
   }
@@ -3069,7 +3067,7 @@ void MDCache::finish_rollback(metareqid_t reqid) {
   if (mds->is_resolve())
     finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
   need_resolve_rollback.erase(reqid);
-  if (need_resolve_ack.empty() && need_resolve_rollback.empty()) {
+  if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
     send_subtree_resolves();
     process_delayed_resolve();
   }
@@ -3417,7 +3415,6 @@ void MDCache::rejoin_send_rejoins()
     if (*p == mds->get_nodeid())  continue;  // nothing to myself!
     if (rejoin_sent.count(*p)) continue;     // already sent a rejoin to this node!
     if (mds->is_rejoin()) {
-      rejoin_gather.insert(*p);
       rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
       rejoins[*p]->copy_cap_exports(cap_export_bl);
     } else if (mds->mdsmap->is_rejoin(*p))
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 10e3dd7..278debf 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -329,9 +329,8 @@ protected:
   friend class ECommitted;
 
   bool resolves_pending;
-  set<int> wants_resolve;   // nodes i need to send my resolve to
-  set<int> got_resolve;     // nodes i got resolves from
-  set<int> need_resolve_ack;   // nodes i need a resolve_ack from
+  set<int> resolve_gather;	// nodes i need resolves from
+  set<int> resolve_ack_gather;	// nodes i need a resolve_ack from
   map<metareqid_t, int> need_resolve_rollback;  // rollbacks i'm writing to the journal
   map<int, MMDSResolve*> delayed_resolve;
   
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 15/39] mds: don't send MDentry{Link,Unlink} before receiving cache rejoin
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (13 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 14/39] mds: set resolve/rejoin gather MDS set in advance Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-20 22:17   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 16/39] mds: send cache rejoin messages after gathering all resolves Yan, Zheng
                   ` (25 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

The active MDS calls MDCache::rejoin_scour_survivor_replicas() when it
receives the cache rejoin message. That function removes the objects
replicated by MDentry{Link,Unlink} from the replica maps, so those messages
must not be sent to a rejoining MDS before its cache rejoin has been received.

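A small illustrative sketch of the guard; the helper, the enum and its
ordering are hypothetical stand-ins for the real MDSMap states.

  #include <iostream>
  #include <set>

  enum State { STATE_RESOLVE, STATE_REJOIN, STATE_ACTIVE };

  // Decide whether to replicate a dentry change to a given peer.
  bool should_send(int replica, State state, const std::set<int>& rejoin_gather) {
    if (state < STATE_REJOIN)
      return false;                                   // too early in recovery
    if (state == STATE_REJOIN && rejoin_gather.count(replica))
      return false;                                   // rejoin not received yet
    return true;
  }

  int main() {
    std::set<int> rejoin_gather = {3};                // still waiting on mds.3
    std::cout << should_send(3, STATE_REJOIN, rejoin_gather) << "\n";  // 0: skip
    std::cout << should_send(4, STATE_ACTIVE, rejoin_gather) << "\n";  // 1: send
  }
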
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 69db1dd..f102205 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -3893,6 +3893,8 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
     }
   }
 
+  assert(rejoin_gather.count(from));
+  rejoin_gather.erase(from);
   if (survivor) {
     // survivor.  do everything now.
     for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
@@ -3911,8 +3913,6 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
       mds->locker->eval_gather(*p);
   } else {
     // done?
-    assert(rejoin_gather.count(from));
-    rejoin_gather.erase(from);
     if (rejoin_gather.empty()) {
       rejoin_gather_finish();
     } else {
@@ -9582,7 +9582,9 @@ void MDCache::send_dentry_link(CDentry *dn)
   for (map<int,int>::iterator p = dn->replicas_begin(); 
        p != dn->replicas_end(); 
        ++p) {
-    if (mds->mdsmap->get_state(p->first) < MDSMap::STATE_REJOIN) 
+    if (mds->mdsmap->get_state(p->first) < MDSMap::STATE_REJOIN ||
+	(mds->mdsmap->get_state(p->first) == MDSMap::STATE_REJOIN &&
+	 rejoin_gather.count(p->first)))
       continue;
     CDentry::linkage_t *dnl = dn->get_linkage();
     MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
@@ -9668,6 +9670,11 @@ void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequest *mdr)
     if (mdr && mdr->more()->witnessed.count(it->first))
       continue;
 
+    if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
+	(mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
+	 rejoin_gather.count(it->first)))
+      continue;
+
     MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->name);
     if (straydn)
       replicate_stray(straydn, it->first, unlink->straybl);
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 16/39] mds: send cache rejoin messages after gathering all resolves
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (14 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 15/39] mds: don't send MDentry{Link,Unlink} before receiving cache rejoin Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-20 22:57   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 17/39] mds: send resolve acks after master updates are safely logged Yan, Zheng
                   ` (24 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 10 ++++++++++
 src/mds/MDCache.h  |  5 +++++
 2 files changed, 15 insertions(+)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index f102205..6853bf1 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -2914,6 +2914,8 @@ void MDCache::maybe_resolve_finish()
       recalc_auth_bits();
       trim_non_auth(); 
       mds->resolve_done();
+    } else {
+      maybe_send_pending_rejoins();
     }
   }
 }
@@ -3398,6 +3400,13 @@ void MDCache::rejoin_send_rejoins()
 {
   dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
 
+  if (!resolve_gather.empty()) {
+    dout(7) << "rejoin_send_rejoins still waiting for resolves ("
+	    << resolve_gather << ")" << dendl;
+    rejoins_pending = true;
+    return;
+  }
+
   map<int, MMDSCacheRejoin*> rejoins;
 
   // encode cap list once.
@@ -3571,6 +3580,7 @@ void MDCache::rejoin_send_rejoins()
     mds->send_message_mds(p->second, p->first);
   }
   rejoin_ack_gather.insert(mds->whoami);   // we need to complete rejoin_gather_finish, too
+  rejoins_pending = false;
 
   // nothing?
   if (mds->is_rejoin() && rejoins.empty()) {
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 278debf..379f715 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -383,6 +383,7 @@ public:
 
 protected:
   // [rejoin]
+  bool rejoins_pending;
   set<int> rejoin_gather;      // nodes from whom i need a rejoin
   set<int> rejoin_sent;        // nodes i sent a rejoin to
   set<int> rejoin_ack_gather;  // nodes from whom i need a rejoin ack
@@ -417,6 +418,10 @@ protected:
   void handle_cache_rejoin_full(MMDSCacheRejoin *m);
   void rejoin_send_acks();
   void rejoin_trim_undef_inodes();
+  void maybe_send_pending_rejoins() {
+    if (rejoins_pending)
+      rejoin_send_rejoins();
+  }
 public:
   void rejoin_gather_finish();
   void rejoin_send_rejoins();
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 17/39] mds: send resolve acks after master updates are safely logged
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (15 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 16/39] mds: send cache rejoin messages after gathering all resolves Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-20 22:58   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 18/39] mds: fix MDS recovery involving cross authority rename Yan, Zheng
                   ` (23 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 33 +++++++++++++++++++++++++++++----
 src/mds/MDCache.h  |  7 ++++++-
 src/mds/Server.cc  |  9 +++++++++
 src/mds/journal.cc |  2 +-
 4 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 6853bf1..9b37b1e 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -2177,6 +2177,17 @@ void MDCache::committed_master_slave(metareqid_t r, int from)
     log_master_commit(r);
 }
 
+void MDCache::logged_master_update(metareqid_t reqid)
+{
+  dout(10) << "logged_master_update " << reqid << dendl;
+  assert(uncommitted_masters.count(reqid));
+  uncommitted_masters[reqid].safe = true;
+  if (pending_masters.count(reqid)) {
+    pending_masters.erase(reqid);
+    if (pending_masters.empty())
+      process_delayed_resolve();
+  }
+}
 
 /*
  * The mds could crash after receiving all slaves' commit acknowledgement,
@@ -2764,8 +2775,23 @@ void MDCache::handle_resolve(MMDSResolve *m)
     return;
   }
 
+  discard_delayed_resolve(from);
+
   // ambiguous slave requests?
   if (!m->slave_requests.empty()) {
+    for (vector<metareqid_t>::iterator p = m->slave_requests.begin();
+	 p != m->slave_requests.end();
+	 ++p) {
+      if (uncommitted_masters.count(*p) && !uncommitted_masters[*p].safe)
+	pending_masters.insert(*p);
+    }
+
+    if (!pending_masters.empty()) {
+      dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
+      delayed_resolve[from] = m;
+      return;
+    }
+
     MMDSResolveAck *ack = new MMDSResolveAck;
     for (vector<metareqid_t>::iterator p = m->slave_requests.begin();
 	 p != m->slave_requests.end();
@@ -2788,7 +2814,6 @@ void MDCache::handle_resolve(MMDSResolve *m)
 
   if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) {
     dout(10) << "delay processing subtree resolve" << dendl;
-    discard_delayed_resolve(from);
     delayed_resolve[from] = m;
     return;
   }
@@ -2883,10 +2908,10 @@ void MDCache::handle_resolve(MMDSResolve *m)
 void MDCache::process_delayed_resolve()
 {
   dout(10) << "process_delayed_resolve" << dendl;
-  for (map<int, MMDSResolve *>::iterator p = delayed_resolve.begin();
-       p != delayed_resolve.end(); ++p)
+  map<int, MMDSResolve*> tmp;
+  tmp.swap(delayed_resolve);
+  for (map<int, MMDSResolve*>::iterator p = tmp.begin(); p != tmp.end(); ++p)
     handle_resolve(p->second);
-  delayed_resolve.clear();
 }
 
 void MDCache::discard_delayed_resolve(int who)
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 379f715..8f262b9 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -281,14 +281,16 @@ public:
 				snapid_t follows=CEPH_NOSNAP);
 
   // slaves
-  void add_uncommitted_master(metareqid_t reqid, LogSegment *ls, set<int> &slaves) {
+  void add_uncommitted_master(metareqid_t reqid, LogSegment *ls, set<int> &slaves, bool safe=false) {
     uncommitted_masters[reqid].ls = ls;
     uncommitted_masters[reqid].slaves = slaves;
+    uncommitted_masters[reqid].safe = safe;
   }
   void wait_for_uncommitted_master(metareqid_t reqid, Context *c) {
     uncommitted_masters[reqid].waiters.push_back(c);
   }
   void log_master_commit(metareqid_t reqid);
+  void logged_master_update(metareqid_t reqid);
   void _logged_master_commit(metareqid_t reqid, LogSegment *ls, list<Context*> &waiters);
   void committed_master_slave(metareqid_t r, int from);
   void finish_committed_masters();
@@ -320,9 +322,12 @@ protected:
     set<int> slaves;
     LogSegment *ls;
     list<Context*> waiters;
+    bool safe;
   };
   map<metareqid_t, umaster>                 uncommitted_masters;         // master: req -> slave set
 
+  set<metareqid_t>		pending_masters;
+
   //map<metareqid_t, bool>     ambiguous_slave_updates;         // for log trimming.
   //map<metareqid_t, Context*> waiting_for_slave_update_commit;
   friend class ESlaveUpdate;
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 8e89e4c..1330f11 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -4463,6 +4463,9 @@ void Server::_link_remote_finish(MDRequest *mdr, bool inc,
 
   assert(g_conf->mds_kill_link_at != 3);
 
+  if (!mdr->more()->witnessed.empty())
+    mdcache->logged_master_update(mdr->reqid);
+
   if (inc) {
     // link the new dentry
     dn->pop_projected_linkage();
@@ -5073,6 +5076,9 @@ void Server::_unlink_local_finish(MDRequest *mdr,
 {
   dout(10) << "_unlink_local_finish " << *dn << dendl;
 
+  if (!mdr->more()->witnessed.empty())
+    mdcache->logged_master_update(mdr->reqid);
+
   // unlink main dentry
   dn->get_dir()->unlink_inode(dn);
   dn->pop_projected_linkage();
@@ -5881,6 +5887,9 @@ void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDe
 {
   dout(10) << "_rename_finish " << *mdr << dendl;
 
+  if (!mdr->more()->witnessed.empty())
+    mdcache->logged_master_update(mdr->reqid);
+
   // apply
   _rename_apply(mdr, srcdn, destdn, straydn);
 
diff --git a/src/mds/journal.cc b/src/mds/journal.cc
index 3375e40..6475eec 100644
--- a/src/mds/journal.cc
+++ b/src/mds/journal.cc
@@ -1738,7 +1738,7 @@ void EUpdate::replay(MDS *mds)
     dout(10) << "EUpdate.replay " << reqid << " had slaves, expecting a matching ECommitted" << dendl;
     _segment->uncommitted_masters.insert(reqid);
     set<int> slaves;
-    mds->mdcache->add_uncommitted_master(reqid, _segment, slaves);
+    mds->mdcache->add_uncommitted_master(reqid, _segment, slaves, true);
   }
   
   if (client_map.length()) {
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 18/39] mds: fix MDS recovery involving cross authority rename
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (16 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 17/39] mds: send resolve acks after master updates are safely logged Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21 17:59   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 19/39] mds: remove MDCache::rejoin_fetch_dirfrags() Yan, Zheng
                   ` (22 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

In an MDS cluster, a rename operation may involve multiple MDSes. The
rename source's auth MDS can crash after some witness MDSes have prepared
the rename but before the rename commits. Later, when that MDS recovers,
its subtree map and linkages differ from those of the prepared MDSes.
This causes problems for both subtree resolve and cache rejoin. The
solution is: if the rename source's auth MDS fails, the prepared witness
MDSes ask the master MDS whether the operation is committing. If it is
not, they roll back the rename, then send the resolve message to the
recovering MDS.

Another similar case is a prepared witness MDS crashing while the rename
source's auth MDS has prepared or is preparing the operation. When the
witness recovers, the master simply delays sending the resolve ack
message until it commits the operation.

This patch also updates Server::handle_client_rename() to make preparing
the rename source's auth MDS the final step before committing the rename.

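The bookkeeping this introduces can be pictured with the following compilable
sketch; metareqid_t is reduced to an int and the class name is made up, it
only shows the shape of ambiguous_slave_updates and its helpers, not the real
implementation.

  #include <cassert>
  #include <map>
  #include <set>

  struct SlaveUpdateTracker {
    // master mds rank -> requests whose fate is unknown until a resolve ack
    std::map<int, std::set<int>> ambiguous;

    bool is_ambiguous(int reqid, int master) const {
      auto it = ambiguous.find(master);
      return it != ambiguous.end() && it->second.count(reqid);
    }
    void add(int reqid, int master) { ambiguous[master].insert(reqid); }
    void remove(int reqid, int master) {
      assert(ambiguous[master].count(reqid));
      ambiguous[master].erase(reqid);
      if (ambiguous[master].empty())
        ambiguous.erase(master);            // drop empty buckets
    }
  };

  int main() {
    SlaveUpdateTracker t;
    t.add(/*reqid=*/7, /*master=*/0);       // srcdn auth (mds.0) failed mid-rename
    assert(t.is_ambiguous(7, 0));
    t.remove(7, 0);                         // resolve ack said "commit"
    assert(!t.is_ambiguous(7, 0));
  }
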
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc |  75 +++++++++++++++++++++++++++++-----------
 src/mds/MDCache.h  |  17 +++++++--
 src/mds/Mutation.h |   2 ++
 src/mds/Server.cc  | 100 ++++++++++++++++++++++++++++-------------------------
 4 files changed, 124 insertions(+), 70 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 9b37b1e..d934020 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -2491,7 +2491,7 @@ void MDCache::send_slave_resolves()
       if (!p->second->is_slave() || !p->second->slave_did_prepare())
 	continue;
       int master = p->second->slave_to_mds;
-      if (resolve_set.count(master)) {
+      if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
 	dout(10) << " including uncommitted " << *p->second << dendl;
 	if (!resolves.count(master))
 	  resolves[master] = new MMDSResolve;
@@ -2610,6 +2610,7 @@ void MDCache::handle_mds_failure(int who)
 
   resolve_gather.insert(who);
   discard_delayed_resolve(who);
+  ambiguous_slave_updates.erase(who);
 
   rejoin_gather.insert(who);
   rejoin_sent.erase(who);        // i need to send another
@@ -2642,14 +2643,46 @@ void MDCache::handle_mds_failure(int who)
 	  finish.push_back(p->second);
       }
     }
+
+    if (p->second->is_slave() &&
+	p->second->slave_did_prepare() && p->second->more()->srcdn_auth_mds == who &&
+	mds->mdsmap->is_clientreplay_or_active_or_stopping(p->second->slave_to_mds)) {
+      // rename srcdn's auth mds failed, resolve even I'm a survivor.
+      dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl;
+      add_ambiguous_slave_update(p->first, p->second->slave_to_mds);
+    }
     
     // failed node is slave?
     if (p->second->is_master() && !p->second->committing) {
+      if (p->second->more()->srcdn_auth_mds == who) {
+	dout(10) << " master request " << *p->second << " waiting for rename srcdn's auth mds."
+		 << who << " to recover" << dendl;
+	assert(p->second->more()->witnessed.count(who) == 0);
+	if (p->second->more()->is_ambiguous_auth)
+	  p->second->clear_ambiguous_auth();
+	// rename srcdn's auth mds failed, all witnesses will rollback
+	p->second->more()->witnessed.clear();
+	pending_masters.erase(p->first);
+      }
+
       if (p->second->more()->witnessed.count(who)) {
-	dout(10) << " master request " << *p->second << " no longer witnessed by slave mds." << who
-		 << dendl;
-	// discard this peer's prepare (if any)
-	p->second->more()->witnessed.erase(who);
+	int srcdn_auth = p->second->more()->srcdn_auth_mds;
+	if (srcdn_auth >= 0 && p->second->more()->waiting_on_slave.count(srcdn_auth)) {
+	  dout(10) << " master request " << *p->second << " waiting for rename srcdn's auth mds."
+		   << p->second->more()->srcdn_auth_mds << " to reply" << dendl;
+	  // waiting for the last slave (rename srcdn's auth mds), delay sending resolve ack
+	  // until either the request is committing or the last slave also fails.
+	  assert(p->second->more()->waiting_on_slave.size() == 1);
+	  pending_masters.insert(p->first);
+	} else {
+	  dout(10) << " master request " << *p->second << " no longer witnessed by slave mds."
+		   << who << " to recover" << dendl;
+	  if (srcdn_auth >= 0)
+	    assert(p->second->more()->witnessed.count(srcdn_auth) == 0);
+
+	  // discard this peer's prepare (if any)
+	  p->second->more()->witnessed.erase(who);
+	}
       }
       
       if (p->second->more()->waiting_on_slave.count(who)) {
@@ -2657,14 +2690,8 @@ void MDCache::handle_mds_failure(int who)
 		 << " to recover" << dendl;
 	// retry request when peer recovers
 	p->second->more()->waiting_on_slave.erase(who);
-	mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second));
-      }
-
-      if (p->second->has_more() && p->second->more()->is_ambiguous_auth &&
-	  p->second->more()->rename_inode->authority().first == who) {
-	dout(10) << " master request " << *p->second << " waiting for renamed inode's auth mds." << who
-		 << " to recover" << dendl;
-	p->second->clear_ambiguous_auth();
+	if (p->second->more()->waiting_on_slave.empty())
+	  mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second));
       }
 
       if (p->second->locking && p->second->locking_target_mds == who)
@@ -2951,16 +2978,27 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
   dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
   int from = ack->get_source().num();
 
-  if (!resolve_ack_gather.count(from)) {
+  if (!resolve_ack_gather.count(from) ||
+      mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
     ack->put();
     return;
   }
 
+  if (ambiguous_slave_updates.count(from)) {
+    assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
+    assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
+  }
+
   for (vector<metareqid_t>::iterator p = ack->commit.begin();
        p != ack->commit.end();
        ++p) {
     dout(10) << " commit on slave " << *p << dendl;
     
+    if (ambiguous_slave_updates.count(from)) {
+      remove_ambiguous_slave_update(*p, from);
+      continue;
+    }
+
     if (mds->is_resolve()) {
       // replay
       MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
@@ -3020,13 +3058,8 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
     }
   }
 
-  if (!mds->is_resolve()) {
-    for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
-	p != active_requests.end(); ++p)
-      assert(p->second->slave_to_mds != from);
-  }
-
-  resolve_ack_gather.erase(from);
+  if (!ambiguous_slave_updates.count(from))
+    resolve_ack_gather.erase(from);
   if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
     send_subtree_resolves();
     process_delayed_resolve();
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 8f262b9..a05ced7 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -327,9 +327,8 @@ protected:
   map<metareqid_t, umaster>                 uncommitted_masters;         // master: req -> slave set
 
   set<metareqid_t>		pending_masters;
+  map<int, set<metareqid_t> >	ambiguous_slave_updates;
 
-  //map<metareqid_t, bool>     ambiguous_slave_updates;         // for log trimming.
-  //map<metareqid_t, Context*> waiting_for_slave_update_commit;
   friend class ESlaveUpdate;
   friend class ECommitted;
 
@@ -353,6 +352,20 @@ protected:
 public:
   void remove_inode_recursive(CInode *in);
 
+  bool is_ambiguous_slave_update(metareqid_t reqid, int master) {
+    return ambiguous_slave_updates.count(master) &&
+	   ambiguous_slave_updates[master].count(reqid);
+  }
+  void add_ambiguous_slave_update(metareqid_t reqid, int master) {
+    ambiguous_slave_updates[master].insert(reqid);
+  }
+  void remove_ambiguous_slave_update(metareqid_t reqid, int master) {
+    assert(ambiguous_slave_updates[master].count(reqid));
+    ambiguous_slave_updates[master].erase(reqid);
+    if (ambiguous_slave_updates[master].empty())
+      ambiguous_slave_updates.erase(master);
+  }
+
   void add_rollback(metareqid_t reqid, int master) {
     need_resolve_rollback[reqid] = master;
   }
diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h
index 5013f04..de122a5 100644
--- a/src/mds/Mutation.h
+++ b/src/mds/Mutation.h
@@ -207,6 +207,7 @@ struct MDRequest : public Mutation {
     
     // for rename
     set<int> extra_witnesses; // replica list from srcdn auth (rename)
+    int srcdn_auth_mds;
     version_t src_reanchor_atid;  // src->dst
     version_t dst_reanchor_atid;  // dst->stray
     bufferlist inode_import;
@@ -233,6 +234,7 @@ struct MDRequest : public Mutation {
     bufferlist rollback_bl;
 
     More() : 
+      srcdn_auth_mds(-1),
       src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0),
       rename_inode(0), is_freeze_authpin(false), is_ambiguous_auth(false),
       is_remote_frozen_authpin(false), is_inode_exporter(false),
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 1330f11..b6e5665 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -5772,12 +5772,52 @@ void Server::handle_client_rename(MDRequest *mdr)
   if (mdr->now == utime_t())
     mdr->now = ceph_clock_now(g_ceph_context);
 
+  // -- prepare anchor updates --
+  if (!linkmerge || srcdnl->is_primary()) {
+    C_GatherBuilder anchorgather(g_ceph_context);
+
+    if (srcdnl->is_primary() &&
+      (srcdnl->get_inode()->is_anchored() ||
+       (srcdnl->get_inode()->is_dir() && (srcdnl->get_inode()->inode.rstat.ranchors ||
+                                          srcdnl->get_inode()->nested_anchors ||
+                                          !mdcache->is_leaf_subtree(mdcache->get_projected_subtree_root(srcdn->get_dir()))))) &&
+      !mdr->more()->src_reanchor_atid) {
+      dout(10) << "reanchoring src->dst " << *srcdnl->get_inode() << dendl;
+      vector<Anchor> trace;
+      destdn->make_anchor_trace(trace, srcdnl->get_inode());
+      mds->anchorclient->prepare_update(srcdnl->get_inode()->ino(),
+					trace, &mdr->more()->src_reanchor_atid,
+					anchorgather.new_sub());
+    }
+    if (destdnl->is_primary() &&
+	destdnl->get_inode()->is_anchored() &&
+	!mdr->more()->dst_reanchor_atid) {
+      dout(10) << "reanchoring dst->stray " << *destdnl->get_inode() << dendl;
+
+      assert(straydn);
+      vector<Anchor> trace;
+      straydn->make_anchor_trace(trace, destdnl->get_inode());
+
+      mds->anchorclient->prepare_update(destdnl->get_inode()->ino(), trace,
+		  &mdr->more()->dst_reanchor_atid, anchorgather.new_sub());
+    }
+
+    if (anchorgather.has_subs())  {
+      anchorgather.set_finisher(new C_MDS_RetryRequest(mdcache, mdr));
+      anchorgather.activate();
+      return;  // waiting for anchor prepares
+    }
+
+    assert(g_conf->mds_kill_rename_at != 2);
+  }
+
   // -- prepare witnesses --
 
   // do srcdn auth last
   int last = -1;
   if (!srcdn->is_auth()) {
     last = srcdn->authority().first;
+    mdr->more()->srcdn_auth_mds = last;
     // ask auth of srci to mark srci as ambiguous auth if more than two MDS
     // are involved in the rename operation.
     if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
@@ -5803,58 +5843,18 @@ void Server::handle_client_rename(MDRequest *mdr)
   if (!mdr->more()->waiting_on_slave.empty())
     return;  // we're waiting for a witness.
 
-  if (last >= 0 &&
-      mdr->more()->witnessed.count(last) == 0 &&
-      mdr->more()->waiting_on_slave.count(last) == 0) {
+  if (last >= 0 && mdr->more()->witnessed.count(last) == 0) {
     dout(10) << " preparing last witness (srcdn auth)" << dendl;
+    assert(mdr->more()->waiting_on_slave.count(last) == 0);
     _rename_prepare_witness(mdr, last, witnesses, srcdn, destdn, straydn);
     return;
   }
 
   // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
   if (!mdr->more()->slaves.empty() && !srci->is_dir())
-    assert(g_conf->mds_kill_rename_at != 2);
+    assert(g_conf->mds_kill_rename_at != 3);
   if (!mdr->more()->slaves.empty() && srci->is_dir())
-    assert(g_conf->mds_kill_rename_at != 3);    
-  
-  // -- prepare anchor updates -- 
-  if (!linkmerge || srcdnl->is_primary()) {
-    C_GatherBuilder anchorgather(g_ceph_context);
-
-    if (srcdnl->is_primary() &&
-	(srcdnl->get_inode()->is_anchored() || 
-	 (srcdnl->get_inode()->is_dir() && (srcdnl->get_inode()->inode.rstat.ranchors ||
-					    srcdnl->get_inode()->nested_anchors ||
-					    !mdcache->is_leaf_subtree(mdcache->get_projected_subtree_root(srcdn->get_dir()))))) &&
-	!mdr->more()->src_reanchor_atid) {
-      dout(10) << "reanchoring src->dst " << *srcdnl->get_inode() << dendl;
-      vector<Anchor> trace;
-      destdn->make_anchor_trace(trace, srcdnl->get_inode());
-      mds->anchorclient->prepare_update(srcdnl->get_inode()->ino(),
-					trace, &mdr->more()->src_reanchor_atid,
-					anchorgather.new_sub());
-    }
-    if (destdnl->is_primary() &&
-	destdnl->get_inode()->is_anchored() &&
-	!mdr->more()->dst_reanchor_atid) {
-      dout(10) << "reanchoring dst->stray " << *destdnl->get_inode() << dendl;
-
-      assert(straydn);
-      vector<Anchor> trace;
-      straydn->make_anchor_trace(trace, destdnl->get_inode());
-      
-      mds->anchorclient->prepare_update(destdnl->get_inode()->ino(), trace,
-		  &mdr->more()->dst_reanchor_atid, anchorgather.new_sub());
-    }
-
-    if (anchorgather.has_subs())  {
-      anchorgather.set_finisher(new C_MDS_RetryRequest(mdcache, mdr));
-      anchorgather.activate();
-      return;  // waiting for anchor prepares
-    }
-
     assert(g_conf->mds_kill_rename_at != 4);
-  }
 
   // -- prepare journal entry --
   mdr->ls = mdlog->get_current_segment();
@@ -6762,10 +6762,17 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r,
     // abort
     //  rollback_bl may be empty if we froze the inode but had to provide an expanded
     // witness list from the master, and they failed before we tried prep again.
-    if (mdr->more()->rollback_bl.length())
-      do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
-    else
+    if (mdr->more()->rollback_bl.length()) {
+      if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
+	mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
+	// rollback but preserve the slave request
+	do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, NULL);
+      } else
+	do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
+    } else {
       dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
+      mds->mdcache->request_finish(mdr);
+    }
   }
 }
 
@@ -6825,7 +6832,6 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr)
   dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
   // need to finish this update before sending resolve to claim the subtree
   mds->mdcache->add_rollback(rollback.reqid, master);
-  assert(mdr || mds->is_resolve());
 
   Mutation *mut = new Mutation(rollback.reqid);
   mut->ls = mds->mdlog->get_current_segment();
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 19/39] mds: remove MDCache::rejoin_fetch_dirfrags()
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (17 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 18/39] mds: fix MDS recovery involving cross authority rename Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-20 22:58   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 20/39] mds: include replica nonce in MMDSCacheRejoin::inode_strong Yan, Zheng
                   ` (21 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

In commit 77946dcdae (mds: fetch missing inodes from disk), I introduced
MDCache::rejoin_fetch_dirfrags(). But it basically duplicates the function
of MDCache::open_undef_dirfrags(), so just remove rejoin_fetch_dirfrags()
and make open_undef_dirfrags() also handle undefined inodes.

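A simplified sketch of the consolidated flow (hypothetical types; CDir::fetch()
is replaced by a print): collect the undefined dirfrags plus the parent
dirfrags of undefined inodes, fetch them all, and have the caller retry once
the fetches complete.

  #include <iostream>
  #include <set>
  #include <string>

  struct Dirfrag { std::string name; };
  struct Inode   { Dirfrag* parent; };

  bool open_undef_inodes_dirfrags(const std::set<Dirfrag*>& undef_dirfrags,
                                  const std::set<Inode*>& undef_inodes) {
    std::set<Dirfrag*> fetch_queue = undef_dirfrags;
    for (Inode* in : undef_inodes)
      fetch_queue.insert(in->parent);       // fetching the parent loads the inode
    if (fetch_queue.empty())
      return false;                         // nothing to do, continue rejoin
    for (Dirfrag* d : fetch_queue)
      std::cout << "fetch " << d->name << "\n";
    return true;                            // caller waits, then retries
  }

  int main() {
    Dirfrag root{"100.0"};
    Inode missing{&root};
    bool waiting = open_undef_inodes_dirfrags({}, {&missing});
    std::cout << (waiting ? "waiting for fetches\n" : "done\n");
  }
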
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/CDir.cc    |  70 +++++++++++--------
 src/mds/MDCache.cc | 193 +++++++++++++++++------------------------------------
 src/mds/MDCache.h  |   5 +-
 3 files changed, 107 insertions(+), 161 deletions(-)

diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 231630e..af0ae9c 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -1553,33 +1553,32 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn)
       if (stale)
 	continue;
 
+      bool undef_inode = false;
       if (dn) {
-        if (dn->get_linkage()->get_inode() == 0) {
-          dout(12) << "_fetched  had NEG dentry " << *dn << dendl;
-        } else {
-          dout(12) << "_fetched  had dentry " << *dn << dendl;
-        }
-      } else {
+	CInode *in = dn->get_linkage()->get_inode();
+	if (in) {
+	  dout(12) << "_fetched  had dentry " << *dn << dendl;
+	  if (in->state_test(CInode::STATE_REJOINUNDEF)) {
+	    assert(cache->mds->is_rejoin());
+	    assert(in->vino() == vinodeno_t(inode.ino, last));
+	    in->state_clear(CInode::STATE_REJOINUNDEF);
+	    cache->opened_undef_inode(in);
+	    undef_inode = true;
+	  }
+	} else
+	  dout(12) << "_fetched  had NEG dentry " << *dn << dendl;
+      }
+
+      if (!dn || undef_inode) {
 	// add inode
 	CInode *in = cache->get_inode(inode.ino, last);
-	if (in) {
-	  dout(0) << "_fetched  badness: got (but i already had) " << *in
-		  << " mode " << in->inode.mode
-		  << " mtime " << in->inode.mtime << dendl;
-	  string dirpath, inopath;
-	  this->inode->make_path_string(dirpath);
-	  in->make_path_string(inopath);
-	  clog.error() << "loaded dup inode " << inode.ino
-	    << " [" << first << "," << last << "] v" << inode.version
-	    << " at " << dirpath << "/" << dname
-	    << ", but inode " << in->vino() << " v" << in->inode.version
-	    << " already exists at " << inopath << "\n";
-	  continue;
-	} else {
-	  // inode
-	  in = new CInode(cache, true, first, last);
-	  in->inode = inode;
+	if (!in || undef_inode) {
+	  if (undef_inode)
+	    in->first = first;
+	  else
+	    in = new CInode(cache, true, first, last);
 	  
+	  in->inode = inode;
 	  // symlink?
 	  if (in->is_symlink()) 
 	    in->symlink = symlink;
@@ -1591,11 +1590,13 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn)
 	  if (snaps)
 	    in->purge_stale_snap_data(*snaps);
 
-	  // add 
-	  cache->add_inode( in );
-	
-	  // link
-	  dn = add_primary_dentry(dname, in, first, last);
+	  if (undef_inode) {
+	    if (inode.anchored)
+	      dn->adjust_nested_anchors(1);
+	  } else {
+	    cache->add_inode( in ); // add
+	    dn = add_primary_dentry(dname, in, first, last); // link
+	  }
 	  dout(12) << "_fetched  got " << *dn << " " << *in << dendl;
 
 	  if (in->inode.is_dirty_rstat())
@@ -1604,6 +1605,19 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn)
 	  //in->hack_accessed = false;
 	  //in->hack_load_stamp = ceph_clock_now(g_ceph_context);
 	  //num_new_inodes_loaded++;
+	} else {
+	  dout(0) << "_fetched  badness: got (but i already had) " << *in
+		  << " mode " << in->inode.mode
+		  << " mtime " << in->inode.mtime << dendl;
+	  string dirpath, inopath;
+	  this->inode->make_path_string(dirpath);
+	  in->make_path_string(inopath);
+	  clog.error() << "loaded dup inode " << inode.ino
+	    << " [" << first << "," << last << "] v" << inode.version
+	    << " at " << dirpath << "/" << dname
+	    << ", but inode " << in->vino() << " v" << in->inode.version
+	    << " already exists at " << inopath << "\n";
+	  continue;
 	}
       }
     } else {
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index d934020..008a8a2 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -4178,7 +4178,6 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack,
 
 CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
 {
-  assert(0);
   CInode *in = new CInode(this, true, 1, last);
   in->inode.ino = ino;
   in->state_set(CInode::STATE_REJOINUNDEF);
@@ -4190,16 +4189,13 @@ CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
 
 CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
 {
-  assert(0);
   CInode *in = get_inode(df.ino);
-  if (!in) {
+  if (!in)
     in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
-    if (!in->is_dir()) {
-      assert(in->state_test(CInode::STATE_REJOINUNDEF));
-      in->inode.mode = S_IFDIR;
-    }
+  if (!in->is_dir()) {
+    assert(in->state_test(CInode::STATE_REJOINUNDEF));
+    in->inode.mode = S_IFDIR;
   }
-  assert(in->is_dir());
   CDir *dir = in->get_or_open_dirfrag(this, df.frag);
   dir->state_set(CDir::STATE_REJOINUNDEF);
   rejoin_undef_dirfrags.insert(dir);
@@ -4207,81 +4203,6 @@ CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
   return dir;
 }
 
-bool MDCache::rejoin_fetch_dirfrags(MMDSCacheRejoin *strong)
-{
-  int skipped = 0;
-  set<CDir*> fetch_queue;
-  for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = strong->strong_dirfrags.begin();
-       p != strong->strong_dirfrags.end();
-       ++p) {
-    CInode *diri = get_inode(p->first.ino);
-    if (!diri) {
-      skipped++;
-      continue;
-    }
-    CDir *dir = diri->get_dirfrag(p->first.frag);
-    if (dir && dir->is_complete())
-      continue;
-
-    set<CDir*> frags;
-    bool refragged = false;
-    if (!dir) {
-      if (diri->dirfragtree.is_leaf(p->first.frag))
-	dir = diri->get_or_open_dirfrag(this, p->first.frag);
-      else {
-	list<frag_t> ls;
-	diri->dirfragtree.get_leaves_under(p->first.frag, ls);
-	if (ls.empty())
-	  ls.push_back(diri->dirfragtree[p->first.frag.value()]);
-	for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
-	  dir = diri->get_or_open_dirfrag(this, p->first.frag);
-	  frags.insert(dir);
-	}
-	refragged = true;
-      }
-    }
-
-    map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = strong->strong_dentries[p->first];
-    for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
-	q != dmap.end();
-	++q) {
-      if (!q->second.is_primary())
-	continue;
-      CDentry *dn;
-      if (!refragged)
-	dn = dir->lookup(q->first.name, q->first.snapid);
-      else {
-	frag_t fg = diri->pick_dirfrag(q->first.name);
-	dir = diri->get_dirfrag(fg);
-	assert(dir);
-	dn = dir->lookup(q->first.name, q->first.snapid);
-      }
-      if (!dn) {
-	fetch_queue.insert(dir);
-	if (!refragged)
-	  break;
-	frags.erase(dir);
-	if (frags.empty())
-	  break;
-      }
-    }
-  }
-
-  if (!fetch_queue.empty()) {
-    dout(10) << "rejoin_fetch_dirfrags " << fetch_queue.size() << " dirfrags" << dendl;
-    strong->get();
-    C_GatherBuilder gather(g_ceph_context, new C_MDS_RetryMessage(mds, strong));
-    for (set<CDir*>::iterator p = fetch_queue.begin(); p != fetch_queue.end(); ++p) {
-      CDir *dir = *p;
-      dir->fetch(gather.new_sub());
-    }
-    gather.activate();
-    return true;
-  }
-  assert(!skipped);
-  return false;
-}
-
 /* This functions DOES NOT put the passed message before returning */
 void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 {
@@ -4290,11 +4211,6 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
   // only a recovering node will get a strong rejoin.
   assert(mds->is_rejoin());
 
-  if (rejoin_fetch_dirfrags(strong))
-    return;
-
-  MMDSCacheRejoin *missing = 0;  // if i'm missing something..
-  
   // assimilate any potentially dirty scatterlock state
   for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = strong->inode_scatterlocks.begin();
        p != strong->inode_scatterlocks.end();
@@ -4319,12 +4235,16 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
        p != strong->strong_dirfrags.end();
        ++p) {
     CInode *diri = get_inode(p->first.ino);
+    if (!diri)
+      diri = rejoin_invent_inode(p->first.ino, CEPH_NOSNAP);
     CDir *dir = diri->get_dirfrag(p->first.frag);
     bool refragged = false;
     if (dir) {
       dout(10) << " have " << *dir << dendl;
     } else {
-      if (diri->dirfragtree.is_leaf(p->first.frag))
+      if (diri->state_test(CInode::STATE_REJOINUNDEF))
+	dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
+      else if (diri->dirfragtree.is_leaf(p->first.frag))
 	dir = rejoin_invent_dirfrag(p->first);
     }
     if (dir) {
@@ -4369,15 +4289,9 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	} else if (q->second.is_null()) {
 	  dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
 	} else {
-	  assert(0);
 	  CInode *in = get_inode(q->second.ino, q->first.snapid);
 	  if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid);
 	  dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid);
-
-	  dout(10) << " missing " << q->second.ino << "." << q->first.snapid << dendl;
-	  if (!missing)
-	    missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING);
-	  missing->add_weak_inode(vinodeno_t(q->second.ino, q->first.snapid));  // we want it back!
 	}
 	dout(10) << " invented " << *dn << dendl;
       }
@@ -4513,19 +4427,15 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
     in->add_replica(from);
   }
 
-  // send missing?
-  if (missing) {
-    // we expect a FULL soon.
-    mds->send_message(missing, strong->get_connection());
+
+
+  // done?
+  assert(rejoin_gather.count(from));
+  rejoin_gather.erase(from);
+  if (rejoin_gather.empty()) {
+    rejoin_gather_finish();
   } else {
-    // done?
-    assert(rejoin_gather.count(from));
-    rejoin_gather.erase(from);
-    if (rejoin_gather.empty()) {
-      rejoin_gather_finish();
-    } else {
-      dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
-    }
+    dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
   }
 }
 
@@ -4800,7 +4710,8 @@ void MDCache::rejoin_gather_finish()
   dout(10) << "rejoin_gather_finish" << dendl;
   assert(mds->is_rejoin());
 
-  rejoin_trim_undef_inodes();
+  if (open_undef_inodes_dirfrags())
+    return;
 
   // fetch paths?
   //  do this before ack, since some inodes we may have already gotten
@@ -5152,44 +5063,62 @@ void MDCache::open_snap_parents()
     gather.set_finisher(new C_MDC_OpenSnapParents(this));
     gather.activate();
   } else {
+    assert(rejoin_waiters.empty());
     assert(missing_snap_parents.empty());
     assert(reconnected_snaprealms.empty());
     dout(10) << "open_snap_parents - all open" << dendl;
     do_delayed_cap_imports();
 
-    open_undef_dirfrags();
+    start_files_to_recover(rejoin_recover_q, rejoin_check_q);
+    mds->rejoin_done();
   }
 }
 
-struct C_MDC_OpenUndefDirfragsFinish : public Context {
-  MDCache *cache;
-  C_MDC_OpenUndefDirfragsFinish(MDCache *c) : cache(c) {}
-  void finish(int r) {
-    cache->open_undef_dirfrags();
+bool MDCache::open_undef_inodes_dirfrags()
+{
+  dout(10) << "open_undef_inodes_dirfrags "
+	   << rejoin_undef_inodes.size() << " inodes "
+	   << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
+
+  set<CDir*> fetch_queue = rejoin_undef_dirfrags;
+
+  for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
+       p != rejoin_undef_inodes.end();
+       ++p) {
+    CInode *in = *p;
+    assert(!in->is_base());
+    fetch_queue.insert(in->get_parent_dir());
   }
-};
 
-void MDCache::open_undef_dirfrags()
-{
-  dout(10) << "open_undef_dirfrags " << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
-  
-  C_GatherBuilder gather(g_ceph_context);
-  for (set<CDir*>::iterator p = rejoin_undef_dirfrags.begin();
-       p != rejoin_undef_dirfrags.end();
+  if (fetch_queue.empty())
+    return false;
+
+  C_GatherBuilder gather(g_ceph_context, new C_MDC_RejoinGatherFinish(this));
+  for (set<CDir*>::iterator p = fetch_queue.begin();
+       p != fetch_queue.end();
        ++p) {
     CDir *dir = *p;
+    CInode *diri = dir->get_inode();
+    if (diri->state_test(CInode::STATE_REJOINUNDEF))
+      continue;
+    if (dir->state_test(CDir::STATE_REJOINUNDEF) && dir->get_frag() == frag_t()) {
+      rejoin_undef_dirfrags.erase(dir);
+      dir->state_clear(CDir::STATE_REJOINUNDEF);
+      diri->force_dirfrags();
+      list<CDir*> ls;
+      diri->get_dirfrags(ls);
+      for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
+	rejoin_undef_dirfrags.insert(*q);
+	(*q)->state_set(CDir::STATE_REJOINUNDEF);
+	(*q)->fetch(gather.new_sub());
+      }
+      continue;
+    }
     dir->fetch(gather.new_sub());
   }
-
-  if (gather.has_subs()) {
-    gather.set_finisher(new C_MDC_OpenUndefDirfragsFinish(this));
-    gather.activate();
-  }
-  else {
-    start_files_to_recover(rejoin_recover_q, rejoin_check_q);
-    mds->queue_waiters(rejoin_waiters);
-    mds->rejoin_done();
-  }
+  assert(gather.has_subs());
+  gather.activate();
+  return true;
 }
 
 void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq)
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index a05ced7..85f5d65 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -496,10 +496,13 @@ public:
   void check_realm_past_parents(SnapRealm *realm);
   void open_snap_parents();
 
-  void open_undef_dirfrags();
+  bool open_undef_inodes_dirfrags();
   void opened_undef_dirfrag(CDir *dir) {
     rejoin_undef_dirfrags.erase(dir);
   }
+  void opened_undef_inode(CInode *in) {
+    rejoin_undef_inodes.erase(in);
+  }
 
   void reissue_all_caps();
   
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 20/39] mds: include replica nonce in MMDSCacheRejoin::inode_strong
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (18 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 19/39] mds: remove MDCache::rejoin_fetch_dirfrags() Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-20 23:26   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 21/39] mds: encode dirfrag base in cache rejoin ack Yan, Zheng
                   ` (20 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

So the recovering MDS can properly handle cache expire messages.
Also increase the nonce value when sending the cache rejoin acks.
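
For reference, the nonce is what lets the auth MDS tell a stale cache expire
apart from a current one. Below is a minimal standalone sketch of that
comparison, using hypothetical types rather than the actual Ceph classes:

    #include <cstdio>
    #include <map>

    // Hypothetical replica map: mds rank -> nonce handed out with the replica.
    static std::map<int, int> replicas;

    // An expire is honoured only if the sender quotes the nonce we last
    // issued; an older nonce means the expire raced with a newer replication.
    static bool handle_expire(int from, int nonce) {
      auto it = replicas.find(from);
      if (it == replicas.end() || it->second != nonce)
        return false;          // stale expire, ignore it
      replicas.erase(it);      // drop the replica
      return true;
    }

    int main() {
      replicas[1] = 2;                          // mds.1 holds a replica, nonce 2
      std::printf("%d\n", handle_expire(1, 1)); // 0: stale nonce, ignored
      std::printf("%d\n", handle_expire(1, 2)); // 1: current nonce, accepted
    }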

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc             | 35 +++++++++++++++++++++++------------
 src/messages/MMDSCacheRejoin.h | 11 +++++++----
 2 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 008a8a2..8ba676e 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -3538,6 +3538,7 @@ void MDCache::rejoin_send_rejoins()
       if (p->first == 0 && root) {
 	p->second->add_weak_inode(root->vino());
 	p->second->add_strong_inode(root->vino(),
+				    root->get_replica_nonce(),
 				    root->get_caps_wanted(),
 				    root->filelock.get_state(),
 				    root->nestlock.get_state(),
@@ -3551,6 +3552,7 @@ void MDCache::rejoin_send_rejoins()
       if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
 	p->second->add_weak_inode(in->vino());
 	p->second->add_strong_inode(in->vino(),
+				    in->get_replica_nonce(),
 				    in->get_caps_wanted(),
 				    in->filelock.get_state(),
 				    in->nestlock.get_state(),
@@ -3709,6 +3711,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
 	CInode *in = dnl->get_inode();
 	dout(15) << " add_strong_inode " << *in << dendl;
 	rejoin->add_strong_inode(in->vino(),
+				 in->get_replica_nonce(),
 				 in->get_caps_wanted(),
 				 in->filelock.get_state(),
 				 in->nestlock.get_state(),
@@ -4248,7 +4251,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	dir = rejoin_invent_dirfrag(p->first);
     }
     if (dir) {
-      dir->add_replica(from);
+      dir->add_replica(from, p->second.nonce);
       dir->dir_rep = p->second.dir_rep;
     } else {
       dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
@@ -4263,7 +4266,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	  dir = rejoin_invent_dirfrag(p->first);
 	else
 	  dout(10) << " have(approx) " << *dir << dendl;
-	dir->add_replica(from);
+	dir->add_replica(from, p->second.nonce);
 	dir->dir_rep = p->second.dir_rep;
       }
       refragged = true;
@@ -4327,7 +4330,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	mdr->locks.insert(&dn->lock);
       }
 
-      dn->add_replica(from);
+      dn->add_replica(from, q->second.nonce);
       dout(10) << " have " << *dn << dendl;
       
       // inode?
@@ -4412,7 +4415,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	  dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl;
 	}
 	
-	in->add_replica(from);
+	in->add_replica(from, p->second.nonce);
 	dout(10) << " have " << *in << dendl;
       }
     }
@@ -5176,7 +5179,7 @@ void MDCache::rejoin_send_acks()
       for (map<int,int>::iterator r = dir->replicas_begin();
 	   r != dir->replicas_end();
 	   ++r) 
-	ack[r->first]->add_strong_dirfrag(dir->dirfrag(), r->second, dir->dir_rep);
+	ack[r->first]->add_strong_dirfrag(dir->dirfrag(), ++r->second, dir->dir_rep);
 	   
       for (CDir::map_t::iterator q = dir->items.begin();
 	   q != dir->items.end();
@@ -5192,7 +5195,7 @@ void MDCache::rejoin_send_acks()
 					   dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
 					   dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
 					   dnl->is_remote() ? dnl->get_remote_d_type():0,
-					   r->second,
+					   ++r->second,
 					   dn->lock.get_replica_state());
 	
 	if (!dnl->is_primary())
@@ -5205,7 +5208,7 @@ void MDCache::rejoin_send_acks()
 	     r != in->replicas_end();
 	     ++r) {
 	  ack[r->first]->add_inode_base(in);
-	  ack[r->first]->add_inode_locks(in, r->second);
+	  ack[r->first]->add_inode_locks(in, ++r->second);
 	}
 	
 	// subdirs in this subtree?
@@ -5220,14 +5223,14 @@ void MDCache::rejoin_send_acks()
 	 r != root->replicas_end();
 	 ++r) {
       ack[r->first]->add_inode_base(root);
-      ack[r->first]->add_inode_locks(root, r->second);
+      ack[r->first]->add_inode_locks(root, ++r->second);
     }
   if (myin)
     for (map<int,int>::iterator r = myin->replicas_begin();
 	 r != myin->replicas_end();
 	 ++r) {
       ack[r->first]->add_inode_base(myin);
-      ack[r->first]->add_inode_locks(myin, r->second);
+      ack[r->first]->add_inode_locks(myin, ++r->second);
     }
 
   // include inode base for any inodes whose scatterlocks may have updated
@@ -5728,6 +5731,12 @@ void MDCache::send_expire_messages(map<int, MCacheExpire*>& expiremap)
   for (map<int, MCacheExpire*>::iterator it = expiremap.begin();
        it != expiremap.end();
        ++it) {
+    if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
+	(mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
+	 rejoin_sent.count(it->first) == 0)) {
+      it->second->put();
+      continue;
+    }
     dout(7) << "sending cache_expire to " << it->first << dendl;
     mds->send_message_mds(it->second, it->first);
   }
@@ -9640,9 +9649,11 @@ void MDCache::handle_dentry_link(MDentryLink *m)
     CInode *in = add_replica_inode(p, NULL, finished);
     assert(in->get_num_ref() == 0);
     assert(in->get_parent_dn() == NULL);
-    MCacheExpire* expire = new MCacheExpire(mds->get_nodeid());
-    expire->add_inode(m->get_subtree(), in->vino(), in->get_replica_nonce());
-    mds->send_message_mds(expire, m->get_source().num());
+    map<int, MCacheExpire*> expiremap;
+    int from = m->get_source().num();
+    expiremap[from] = new MCacheExpire(mds->get_nodeid());
+    expiremap[from]->add_inode(m->get_subtree(), in->vino(), in->get_replica_nonce());
+    send_expire_messages(expiremap);
     remove_inode(in);
   }
 
diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h
index 825400d..b88f551 100644
--- a/src/messages/MMDSCacheRejoin.h
+++ b/src/messages/MMDSCacheRejoin.h
@@ -43,19 +43,22 @@ class MMDSCacheRejoin : public Message {
 
   // -- types --
   struct inode_strong { 
+    int32_t nonce;
     int32_t caps_wanted;
     int32_t filelock, nestlock, dftlock;
     inode_strong() {}
-    inode_strong(int cw, int dl, int nl, int dftl) : 
-      caps_wanted(cw),
+    inode_strong(int n, int cw, int dl, int nl, int dftl) :
+      nonce(n), caps_wanted(cw),
       filelock(dl), nestlock(nl), dftlock(dftl) { }
     void encode(bufferlist &bl) const {
+      ::encode(nonce, bl);
       ::encode(caps_wanted, bl);
       ::encode(filelock, bl);
       ::encode(nestlock, bl);
       ::encode(dftlock, bl);
     }
     void decode(bufferlist::iterator &bl) {
+      ::decode(nonce, bl);
       ::decode(caps_wanted, bl);
       ::decode(filelock, bl);
       ::decode(nestlock, bl);
@@ -208,8 +211,8 @@ public:
   void add_weak_inode(vinodeno_t i) {
     weak_inodes.insert(i);
   }
-  void add_strong_inode(vinodeno_t i, int cw, int dl, int nl, int dftl) {
-    strong_inodes[i] = inode_strong(cw, dl, nl, dftl);
+  void add_strong_inode(vinodeno_t i, int n, int cw, int dl, int nl, int dftl) {
+    strong_inodes[i] = inode_strong(n, cw, dl, nl, dftl);
   }
   void add_inode_locks(CInode *in, __u32 nonce) {
     ::encode(in->inode.ino, inode_locks);
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 21/39] mds: encode dirfrag base in cache rejoin ack
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (19 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 20/39] mds: include replica nonce in MMDSCacheRejoin::inode_strong Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-20 23:33   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 22/39] mds: handle linkage mismatch during cache rejoin Yan, Zheng
                   ` (19 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

The cache rejoin ack message already encodes the inode base; make it also
encode the dirfrag base. This allows the message to replicate stray dentries
the way the MDentryUnlink message does. The new function will be used by a
later patch.
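
The dirfrag base follows the same framing as the existing inode base: each
entry is appended to one opaque buffer and the receiver decodes entries until
the iterator reaches the end. A small standalone sketch of that framing, with
std::string standing in for bufferlist and hypothetical helpers instead of
the Ceph encoders:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <string>

    // Append a 32-bit length followed by the payload, like ::encode(bufferlist).
    static void put_blob(std::string& out, const std::string& blob) {
      uint32_t len = blob.size();
      out.append(reinterpret_cast<const char*>(&len), sizeof(len));
      out.append(blob);
    }

    // Read back one length-prefixed blob, advancing the offset.
    static std::string get_blob(const std::string& in, size_t& off) {
      uint32_t len;
      std::memcpy(&len, in.data() + off, sizeof(len));
      off += sizeof(len);
      std::string blob = in.substr(off, len);
      off += len;
      return blob;
    }

    int main() {
      std::string payload;
      put_blob(payload, "dirfrag 100.0 base");   // sender: add_dirfrag_base() analogue
      put_blob(payload, "dirfrag 200.1 base");

      size_t off = 0;
      while (off < payload.size())               // receiver: loop until iterator end
        std::printf("got %s\n", get_blob(payload, off).c_str());
    }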

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/CDir.h                 | 20 +++++++++++++-------
 src/mds/MDCache.cc             | 20 ++++++++++++++++++--
 src/messages/MMDSCacheRejoin.h | 12 +++++++++++-
 3 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/src/mds/CDir.h b/src/mds/CDir.h
index 79946f1..f4a3a3d 100644
--- a/src/mds/CDir.h
+++ b/src/mds/CDir.h
@@ -437,23 +437,29 @@ private:
     ::encode(dist, bl);
   }
 
-  void encode_replica(int who, bufferlist& bl) {
-    __u32 nonce = add_replica(who);
-    ::encode(nonce, bl);
+  void _encode_base(bufferlist& bl) {
     ::encode(first, bl);
     ::encode(fnode, bl);
     ::encode(dir_rep, bl);
     ::encode(dir_rep_by, bl);
   }
-  void decode_replica(bufferlist::iterator& p) {
-    __u32 nonce;
-    ::decode(nonce, p);
-    replica_nonce = nonce;
+  void _decode_base(bufferlist::iterator& p) {
     ::decode(first, p);
     ::decode(fnode, p);
     ::decode(dir_rep, p);
     ::decode(dir_rep_by, p);
   }
+  void encode_replica(int who, bufferlist& bl) {
+    __u32 nonce = add_replica(who);
+    ::encode(nonce, bl);
+    _encode_base(bl);
+  }
+  void decode_replica(bufferlist::iterator& p) {
+    __u32 nonce;
+    ::decode(nonce, p);
+    replica_nonce = nonce;
+    _decode_base(p);
+  }
 
 
 
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 8ba676e..344777e 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -4510,8 +4510,22 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
     }
   }
 
+  // full dirfrags
+  bufferlist::iterator p = ack->dirfrag_base.begin();
+  while (!p.end()) {
+    dirfrag_t df;
+    bufferlist basebl;
+    ::decode(df, p);
+    ::decode(basebl, p);
+    CDir *dir = get_dirfrag(df);
+    assert(dir);
+    bufferlist::iterator q = basebl.begin();
+    dir->_decode_base(q);
+    dout(10) << " got dir replica " << *dir << dendl;
+  }
+
   // full inodes
-  bufferlist::iterator p = ack->inode_base.begin();
+  p = ack->inode_base.begin();
   while (!p.end()) {
     inodeno_t ino;
     snapid_t last;
@@ -5178,8 +5192,10 @@ void MDCache::rejoin_send_acks()
       // dir
       for (map<int,int>::iterator r = dir->replicas_begin();
 	   r != dir->replicas_end();
-	   ++r) 
+	   ++r) {
 	ack[r->first]->add_strong_dirfrag(dir->dirfrag(), ++r->second, dir->dir_rep);
+	ack[r->first]->add_dirfrag_base(dir);
+      }
 	   
       for (CDir::map_t::iterator q = dir->items.begin();
 	   q != dir->items.end();
diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h
index b88f551..7c37ab4 100644
--- a/src/messages/MMDSCacheRejoin.h
+++ b/src/messages/MMDSCacheRejoin.h
@@ -20,6 +20,7 @@
 #include "include/types.h"
 
 #include "mds/CInode.h"
+#include "mds/CDir.h"
 
 // sent from replica to auth
 
@@ -169,6 +170,7 @@ class MMDSCacheRejoin : public Message {
   // full
   bufferlist inode_base;
   bufferlist inode_locks;
+  bufferlist dirfrag_base;
 
   // authpins, xlocks
   struct slave_reqid {
@@ -258,7 +260,13 @@ public:
   void add_strong_dirfrag(dirfrag_t df, int n, int dr) {
     strong_dirfrags[df] = dirfrag_strong(n, dr);
   }
-   
+  void add_dirfrag_base(CDir *dir) {
+    ::encode(dir->dirfrag(), dirfrag_base);
+    bufferlist bl;
+    dir->_encode_base(bl);
+    ::encode(bl, dirfrag_base);
+  }
+
   // dentries
   void add_weak_dirfrag(dirfrag_t df) {
     weak_dirfrags.insert(df);
@@ -294,6 +302,7 @@ public:
     ::encode(wrlocked_inodes, payload);
     ::encode(cap_export_bl, payload);
     ::encode(strong_dirfrags, payload);
+    ::encode(dirfrag_base, payload);
     ::encode(weak, payload);
     ::encode(weak_dirfrags, payload);
     ::encode(weak_inodes, payload);
@@ -319,6 +328,7 @@ public:
       ::decode(cap_export_paths, q);
     }
     ::decode(strong_dirfrags, p);
+    ::decode(dirfrag_base, p);
     ::decode(weak, p);
     ::decode(weak_dirfrags, p);
     ::decode(weak_inodes, p);
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 22/39] mds: handle linkage mismatch during cache rejoin
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (20 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 21/39] mds: encode dirfrag base in cache rejoin ack Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21 21:23   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 23/39] mds: reqid for rejoinning authpin/wrlock need to be list Yan, Zheng
                   ` (18 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

In an MDS cluster, not all file system namespace operations that affect
multiple MDSs use two-phase commit. Some operations use dentry link/unlink
messages to update the replica dentry's linkage after they are committed by
the master MDS. It is possible for the master MDS to crash after journaling
an operation, but before sending the dentry link/unlink messages. Later, when
the MDS recovers and receives cache rejoin messages from the surviving MDSs,
it will find linkage mismatches.

The original cache rejoin code does not properly handle the case where dentry
unlink messages were missing. Unlinked inodes were linked to stray dentries,
so the cache rejoin ack message needs to push replicas of these stray
dentries to the surviving MDS.

This patch also adds code that handles cache expiration in the middle of
cache rejoining.
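
The mismatch handling reduces to comparing what the survivor claims for a
dentry against what the recovering MDS actually has linked after journal
replay. A rough standalone sketch of that decision, with simplified structs
in place of the real CDentry/CInode linkage:

    #include <cstdio>
    #include <set>

    struct Claim { bool primary; long ino; };   // what the survivor's rejoin says
    struct Local { bool primary; long ino; };   // what we have after replay

    // Returns the inode number (if any) whose replica the survivor is still
    // holding even though our journal says the dentry changed; such inodes
    // must be pushed back as stray replicas in the rejoin ack.
    static long classify(const Claim& c, const Local& l) {
      if (l.primary && c.primary && c.ino != l.ino)
        return c.ino;          // survivor missed MDentryUnlink+MDentryLink
      if (!l.primary && c.primary)
        return c.ino;          // survivor missed MDentryUnlink
      return 0;                // linkage agrees, or survivor only missed a link
    }

    int main() {
      std::set<long> unlinked;
      long ino = classify(Claim{true, 42}, Local{false, 0});
      if (ino)
        unlinked.insert(ino);
      std::printf("stray replicas to push: %zu\n", unlinked.size());
    }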

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 348 +++++++++++++++++++++++++++++++++++------------------
 src/mds/MDCache.h  |   1 +
 2 files changed, 233 insertions(+), 116 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 344777e..38b1fdf 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -3536,7 +3536,6 @@ void MDCache::rejoin_send_rejoins()
     } else {
       // strong
       if (p->first == 0 && root) {
-	p->second->add_weak_inode(root->vino());
 	p->second->add_strong_inode(root->vino(),
 				    root->get_replica_nonce(),
 				    root->get_caps_wanted(),
@@ -3550,7 +3549,6 @@ void MDCache::rejoin_send_rejoins()
       }
 
       if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
-	p->second->add_weak_inode(in->vino());
 	p->second->add_strong_inode(in->vino(),
 				    in->get_replica_nonce(),
 				    in->get_caps_wanted(),
@@ -3567,6 +3565,8 @@ void MDCache::rejoin_send_rejoins()
     for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
 	 p != active_requests.end();
 	 ++p) {
+      if ( p->second->is_slave())
+	continue;
       // auth pins
       for (set<MDSCacheObject*>::iterator q = p->second->remote_auth_pins.begin();
 	   q != p->second->remote_auth_pins.end();
@@ -4226,6 +4226,8 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
     rejoin_potential_updated_scatterlocks.insert(in);
   }
 
+  rejoin_unlinked_inodes[from].clear();
+
   // surviving peer may send incorrect dirfrag here (maybe they didn't
   // get the fragment notify, or maybe we rolled back?).  we need to
   // infer the right frag and get them with the program.  somehow.
@@ -4332,105 +4334,125 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 
       dn->add_replica(from, q->second.nonce);
       dout(10) << " have " << *dn << dendl;
-      
-      // inode?
-      if (dnl->is_primary()) {
-	CInode *in = dnl->get_inode();
-	assert(in);
-
-	if (strong->strong_inodes.count(in->vino())) {
-	  MMDSCacheRejoin::inode_strong &is = strong->strong_inodes[in->vino()];
 
-	  // caps_wanted
-	  if (is.caps_wanted) {
-	    in->mds_caps_wanted[from] = is.caps_wanted;
-	    dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
-		     << " on " << *in << dendl;
-	  } 
-	  
-	  // scatterlocks?
-	  //  infer state from replica state:
-	  //   * go to MIX if they might have wrlocks
-	  //   * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
-	  in->filelock.infer_state_from_strong_rejoin(is.filelock, true);  // maybe also go to LOCK
-	  in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
-	  in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
-	  
-	  // auth pin?
-	  if (strong->authpinned_inodes.count(in->vino())) {
-	    MMDSCacheRejoin::slave_reqid r = strong->authpinned_inodes[in->vino()];
-	    dout(10) << " inode authpin by " << r << " on " << *in << dendl;
-	    
-	    // get/create slave mdrequest
-	    MDRequest *mdr;
-	    if (have_request(r.reqid))
-	      mdr = request_get(r.reqid);
-	    else
-	      mdr = request_start_slave(r.reqid, r.attempt, from);
-	    if (strong->frozen_authpin_inodes.count(in->vino())) {
-	      assert(!in->get_num_auth_pins());
-	      mdr->freeze_auth_pin(in);
-	    } else {
-	      assert(!in->is_frozen_auth_pin());
-	    }
-	    mdr->auth_pin(in);
-	  }
-	  // xlock(s)?
-	  if (strong->xlocked_inodes.count(in->vino())) {
-	    for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->xlocked_inodes[in->vino()].begin();
-		 r != strong->xlocked_inodes[in->vino()].end();
-		 ++r) {
-	      SimpleLock *lock = in->get_lock(r->first);
-	      dout(10) << " inode xlock by " << r->second << " on " << *lock << " on " << *in << dendl;
-	      MDRequest *mdr = request_get(r->second.reqid);  // should have this from auth_pin above.
-	      assert(mdr->is_auth_pinned(in));
-	      if (lock->is_stable())
-		in->auth_pin(lock);
-	      lock->set_state(LOCK_XLOCK);
-	      if (lock == &in->filelock)
-		in->loner_cap = -1;
-	      lock->get_xlock(mdr, mdr->get_client());
-	      mdr->xlocks.insert(lock);
-	      mdr->locks.insert(lock);
-	    }
-	  }
-	  // wrlock(s)?
-	  if (strong->wrlocked_inodes.count(in->vino())) {
-	    for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->wrlocked_inodes[in->vino()].begin();
-		 r != strong->wrlocked_inodes[in->vino()].end();
-		 ++r) {
-	      SimpleLock *lock = in->get_lock(r->first);
-	      dout(10) << " inode wrlock by " << r->second << " on " << *lock << " on " << *in << dendl;
-	      MDRequest *mdr = request_get(r->second.reqid);  // should have this from auth_pin above.
-	      assert(mdr->is_auth_pinned(in));
-	      lock->set_state(LOCK_LOCK);
-	      if (lock == &in->filelock)
-		in->loner_cap = -1;
-	      lock->get_wrlock(true);
-	      mdr->wrlocks.insert(lock);
-	      mdr->locks.insert(lock);
-	    }
+      if (dnl->is_primary()) {
+	if (q->second.is_primary()) {
+	  if (!(vinodeno_t(q->second.ino, q->first.snapid) == dnl->get_inode()->vino())) {
+	    // the survivor missed MDentryUnlink+MDentryLink messages ?
+	    assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
+	    CInode *in = get_inode(q->second.ino, q->first.snapid);
+	    assert(in);
+	    rejoin_unlinked_inodes[from].insert(in);
+	    dout(7) << " sender has primary dentry but wrong inode" << dendl;
 	  }
 	} else {
-	  dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl;
+	  // the survivor missed MDentryLink message ?
+	  assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
+	  dout(7) << " sender doesn't primay dentry" << dendl;
+	}
+      } else {
+	if (q->second.is_primary()) {
+	  // the survivor missed MDentryUnlink message ?
+	  CInode *in = get_inode(q->second.ino, q->first.snapid);
+	  assert(in);
+	  rejoin_unlinked_inodes[from].insert(in);
+	  dout(7) << " sender has primary dentry but we don't" << dendl;
 	}
-	
-	in->add_replica(from, p->second.nonce);
-	dout(10) << " have " << *in << dendl;
       }
     }
   }
 
-  // base inodes?  (root, stray, etc.)
-  for (set<vinodeno_t>::iterator p = strong->weak_inodes.begin();
-       p != strong->weak_inodes.end();
+  for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
+       p != strong->strong_inodes.end();
        ++p) {
-    CInode *in = get_inode(*p);
-    dout(10) << " have base " << *in << dendl;
-    in->add_replica(from);
+    CInode *in = get_inode(p->first);
+    assert(in);
+    in->add_replica(from, p->second.nonce);
+    dout(10) << " have " << *in << dendl;
+
+    MMDSCacheRejoin::inode_strong &is = p->second;
+
+    // caps_wanted
+    if (is.caps_wanted) {
+      in->mds_caps_wanted[from] = is.caps_wanted;
+      dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
+	       << " on " << *in << dendl;
+    }
+
+    // scatterlocks?
+    //  infer state from replica state:
+    //   * go to MIX if they might have wrlocks
+    //   * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
+    in->filelock.infer_state_from_strong_rejoin(is.filelock, true);  // maybe also go to LOCK
+    in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
+    in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
+
+    // auth pin?
+    if (strong->authpinned_inodes.count(in->vino())) {
+      MMDSCacheRejoin::slave_reqid r = strong->authpinned_inodes[in->vino()];
+      dout(10) << " inode authpin by " << r << " on " << *in << dendl;
+
+      // get/create slave mdrequest
+      MDRequest *mdr;
+      if (have_request(r.reqid))
+	mdr = request_get(r.reqid);
+      else
+	mdr = request_start_slave(r.reqid, r.attempt, from);
+      if (strong->frozen_authpin_inodes.count(in->vino())) {
+	assert(!in->get_num_auth_pins());
+	mdr->freeze_auth_pin(in);
+      } else {
+	assert(!in->is_frozen_auth_pin());
+      }
+      mdr->auth_pin(in);
+    }
+    // xlock(s)?
+    if (strong->xlocked_inodes.count(in->vino())) {
+      for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
+	   q != strong->xlocked_inodes[in->vino()].end();
+	   ++q) {
+	SimpleLock *lock = in->get_lock(q->first);
+	dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
+	MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
+	assert(mdr->is_auth_pinned(in));
+	if (lock->is_stable())
+	  in->auth_pin(lock);
+	lock->set_state(LOCK_XLOCK);
+	if (lock == &in->filelock)
+	  in->loner_cap = -1;
+	lock->get_xlock(mdr, mdr->get_client());
+	mdr->xlocks.insert(lock);
+	mdr->locks.insert(lock);
+      }
+    }
+    // wrlock(s)?
+    if (strong->wrlocked_inodes.count(in->vino())) {
+      for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->wrlocked_inodes[in->vino()].begin();
+	   q != strong->wrlocked_inodes[in->vino()].end();
+	   ++q) {
+	SimpleLock *lock = in->get_lock(q->first);
+	dout(10) << " inode wrlock by " << q->second << " on " << *lock << " on " << *in << dendl;
+	MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
+	assert(mdr->is_auth_pinned(in));
+	lock->set_state(LOCK_LOCK);
+	if (lock == &in->filelock)
+	  in->loner_cap = -1;
+	lock->get_wrlock(true);
+	mdr->wrlocks.insert(lock);
+	mdr->locks.insert(lock);
+      }
+    }
   }
 
-
+  // unlinked inodes should be in stray
+  for (set<CInode*>::iterator p = rejoin_unlinked_inodes[from].begin();
+       p != rejoin_unlinked_inodes[from].end();
+       ++p) {
+    CInode *in = *p;
+    dout(7) << " unlinked inode " << *in << dendl;
+    assert(in->get_parent_dn());
+    assert(in->is_replica(from));
+  }
 
   // done?
   assert(rejoin_gather.count(from));
@@ -4448,6 +4470,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
   dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
   int from = ack->get_source().num();
 
+  // for sending cache expire message
+  list<CInode*> isolated_inodes;
+
   // dirs
   for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
        p != ack->strong_dirfrags.end();
@@ -4455,7 +4480,29 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
     // we may have had incorrect dir fragmentation; refragment based
     // on what they auth tells us.
     CDir *dir = get_force_dirfrag(p->first);
-    assert(dir);
+    if (!dir) {
+      CInode *diri = get_inode(p->first.ino);
+      if (!diri) {
+	// barebones inode; the full inode loop below will clean up.
+	diri = new CInode(this, false);
+	diri->inode.ino = p->first.ino;
+	diri->inode.mode = S_IFDIR;
+	if (MDS_INO_MDSDIR(p->first.ino)) {
+	  diri->inode_auth = pair<int,int>(from, CDIR_AUTH_UNKNOWN);
+	  add_inode(diri);
+	  dout(10) << " add inode " << *diri << dendl;
+	} else {
+	  diri->inode_auth = CDIR_AUTH_UNDEF;
+	  isolated_inodes.push_back(diri);
+	  dout(10) << " unconnected dirfrag " << p->first << dendl;
+	}
+      }
+      // barebones dirfrag; the full dirfrag loop below will clean up.
+      dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
+      if (dir->authority().first != from)
+	adjust_subtree_auth(dir, from);
+      dout(10) << " add dirfrag " << *dir << dendl;
+    }
 
     dir->set_replica_nonce(p->second.nonce);
     dir->state_clear(CDir::STATE_REJOINING);
@@ -4467,7 +4514,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
 	 q != dmap.end();
 	 ++q) {
       CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
-      assert(dn);
+      if(!dn)
+	dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
+
       CDentry::linkage_t *dnl = dn->get_linkage();
 
       assert(dn->last == q->first.snapid);
@@ -4476,33 +4525,48 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
 	dn->first = q->second.first;
       }
 
+      // may have bad linkage if we missed dentry link/unlink messages
+      if (dnl->is_primary()) {
+	CInode *in = dnl->get_inode();
+	if (!q->second.is_primary() ||
+	    !(vinodeno_t(q->second.ino, q->first.snapid) == in->vino())) {
+	  dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
+	  dir->unlink_inode(dn);
+	}
+      } else if (dnl->is_remote()) {
+	if (!q->second.is_remote() ||
+	    q->second.remote_ino != dnl->get_remote_ino() ||
+	    q->second.remote_d_type != dnl->get_remote_d_type()) {
+	  dout(10) << " had bad linkage for " << *dn <<  dendl;
+	  dir->unlink_inode(dn);
+	}
+      } else {
+	if (!q->second.is_null())
+	  dout(10) << " had bad linkage for " << *dn <<  dendl;
+      }
+
       // hmm, did we have the proper linkage here?
-      if (dnl->is_null() &&
-	  !q->second.is_null()) {
-	dout(10) << " had bad (missing) linkage for " << *dn << dendl;
+      if (dnl->is_null() && !q->second.is_null()) {
 	if (q->second.is_remote()) {
 	  dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
 	} else {
 	  CInode *in = get_inode(q->second.ino, q->first.snapid);
-	  assert(in == 0);  // a rename would have been caught be the resolve stage.
-	  // barebones inode; the full inode loop below will clean up.
-	  in = new CInode(this, false, q->second.first, q->first.snapid);
-	  in->inode.ino = q->second.ino;
-	  add_inode(in);
+	  if (!in) {
+	    // barebones inode; assume it's dir, the full inode loop below will clean up.
+	    in = new CInode(this, false, q->second.first, q->first.snapid);
+	    in->inode.ino = q->second.ino;
+	    in->inode.mode = S_IFDIR;
+	    add_inode(in);
+	    dout(10) << " add inode " << *in << dendl;
+	  } else if (in->get_parent_dn()) {
+	    dout(10) << " had bad linkage for " << *(in->get_parent_dn())
+		     << ", unlinking " << *in << dendl;
+	    in->get_parent_dir()->unlink_inode(in->get_parent_dn());
+	  }
 	  dn->dir->link_primary_inode(dn, in);
 	}
       }
-      else if (!dnl->is_null() &&
-	       q->second.is_null()) {
-	dout(0) << " had bad linkage for " << *dn << dendl;
-	/* 
-	 * this should happen:
-	 *  if we're a survivor, any unlink should commit or rollback during
-	 * the resolve stage.
-	 *  if we failed, we shouldn't have non-auth leaf dentries at all
-	 */
-	assert(0);  // uh oh.	
-      }
+
       dn->set_replica_nonce(q->second.nonce);
       dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters);
       dn->state_clear(CDentry::STATE_REJOINING);
@@ -4564,6 +4628,21 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
     dout(10) << " got inode locks " << *in << dendl;
   }
 
+  // trim unconnected subtree
+  if (!isolated_inodes.empty()) {
+    map<int, MCacheExpire*> expiremap;
+    for (list<CInode*>::iterator p = isolated_inodes.begin();
+	 p != isolated_inodes.end();
+	 ++p) {
+      list<CDir*> ls;
+      (*p)->get_dirfrags(ls);
+      trim_dirfrag(*ls.begin(), 0, expiremap);
+      assert((*p)->get_num_ref() == 0);
+      delete *p;
+    }
+    send_expire_messages(expiremap);
+  }
+
   // done?
   assert(rejoin_ack_gather.count(from));
   rejoin_ack_gather.erase(from);
@@ -5164,6 +5243,37 @@ void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snap
 void MDCache::rejoin_send_acks()
 {
   dout(7) << "rejoin_send_acks" << dendl;
+
+  // replicate stray
+  for (map<int, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
+       p != rejoin_unlinked_inodes.end();
+       ++p) {
+    for (set<CInode*>::iterator q = p->second.begin();
+	 q != p->second.end();
+	 ++q) {
+      CInode *in = *q;
+      dout(7) << " unlinked inode " << *in << dendl;
+      // inode expired
+      if (!in->is_replica(p->first))
+	continue;
+      while (1) {
+	CDentry *dn = in->get_parent_dn();
+	if (dn->is_replica(p->first))
+	  break;
+	dn->add_replica(p->first);
+	CDir *dir = dn->get_dir();
+	if (dir->is_replica(p->first))
+	  break;
+	dir->add_replica(p->first);
+	in = dir->get_inode();
+	if (in->is_replica(p->first))
+	  break;
+	if (in->is_base())
+	  break;
+      }
+    }
+  }
+  rejoin_unlinked_inodes.clear();
   
   // send acks to everyone in the recovery set
   map<int,MMDSCacheRejoin*> ack;
@@ -5203,23 +5313,29 @@ void MDCache::rejoin_send_acks()
 	CDentry *dn = q->second;
 	CDentry::linkage_t *dnl = dn->get_linkage();
 
+	// inode
+	CInode *in = NULL;
+	if (dnl->is_primary())
+	  in = dnl->get_inode();
+
 	// dentry
 	for (map<int,int>::iterator r = dn->replicas_begin();
 	     r != dn->replicas_end();
-	     ++r) 
+	     ++r) {
 	  ack[r->first]->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
 					   dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
 					   dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
 					   dnl->is_remote() ? dnl->get_remote_d_type():0,
 					   ++r->second,
 					   dn->lock.get_replica_state());
+	  // peer missed MDentrylink message ?
+	  if (in && !in->is_replica(r->first))
+	    in->add_replica(r->first);
+	}
 	
-	if (!dnl->is_primary())
+	if (!in)
 	  continue;
 
-	// inode
-	CInode *in = dnl->get_inode();
-
 	for (map<int,int>::iterator r = in->replicas_begin();
 	     r != in->replicas_end();
 	     ++r) {
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 85f5d65..09cc092 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -416,6 +416,7 @@ protected:
   set<CInode*> rejoin_undef_inodes;
   set<CInode*> rejoin_potential_updated_scatterlocks;
   set<CDir*>   rejoin_undef_dirfrags;
+  map<int, set<CInode*> > rejoin_unlinked_inodes;
 
   vector<CInode*> rejoin_recover_q, rejoin_check_q;
   list<Context*> rejoin_waiters;
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 23/39] mds: reqid for rejoinning authpin/wrlock need to be list
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (21 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 22/39] mds: handle linkage mismatch during cache rejoin Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-20 23:59   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 24/39] mds: take object's versionlock when rejoinning xlock Yan, Zheng
                   ` (17 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc             | 78 ++++++++++++++++++++++++------------------
 src/messages/MMDSCacheRejoin.h | 12 +++----
 2 files changed, 50 insertions(+), 40 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 38b1fdf..f4622de 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -4305,16 +4305,19 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
       // dn auth_pin?
       if (strong->authpinned_dentries.count(p->first) &&
 	  strong->authpinned_dentries[p->first].count(q->first)) {
-	MMDSCacheRejoin::slave_reqid r = strong->authpinned_dentries[p->first][q->first];
-	dout(10) << " dn authpin by " << r << " on " << *dn << dendl;
-	
-	// get/create slave mdrequest
-	MDRequest *mdr;
-	if (have_request(r.reqid))
-	  mdr = request_get(r.reqid);
-	else
-	  mdr = request_start_slave(r.reqid, r.attempt, from);
-	mdr->auth_pin(dn);
+	for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_dentries[p->first][q->first].begin();
+	     r != strong->authpinned_dentries[p->first][q->first].end();
+	     ++r) {
+	  dout(10) << " dn authpin by " << *r << " on " << *dn << dendl;
+
+	  // get/create slave mdrequest
+	  MDRequest *mdr;
+	  if (have_request(r->reqid))
+	    mdr = request_get(r->reqid);
+	  else
+	    mdr = request_start_slave(r->reqid, r->attempt, from);
+	  mdr->auth_pin(dn);
+	}
       }
 
       // dn xlock?
@@ -4389,22 +4392,25 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 
     // auth pin?
     if (strong->authpinned_inodes.count(in->vino())) {
-      MMDSCacheRejoin::slave_reqid r = strong->authpinned_inodes[in->vino()];
-      dout(10) << " inode authpin by " << r << " on " << *in << dendl;
+      for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_inodes[in->vino()].begin();
+	   r != strong->authpinned_inodes[in->vino()].end();
+	   ++r) {
+	dout(10) << " inode authpin by " << *r << " on " << *in << dendl;
 
-      // get/create slave mdrequest
-      MDRequest *mdr;
-      if (have_request(r.reqid))
-	mdr = request_get(r.reqid);
-      else
-	mdr = request_start_slave(r.reqid, r.attempt, from);
-      if (strong->frozen_authpin_inodes.count(in->vino())) {
-	assert(!in->get_num_auth_pins());
-	mdr->freeze_auth_pin(in);
-      } else {
-	assert(!in->is_frozen_auth_pin());
+	// get/create slave mdrequest
+	MDRequest *mdr;
+	if (have_request(r->reqid))
+	  mdr = request_get(r->reqid);
+	else
+	  mdr = request_start_slave(r->reqid, r->attempt, from);
+	if (strong->frozen_authpin_inodes.count(in->vino())) {
+	  assert(!in->get_num_auth_pins());
+	  mdr->freeze_auth_pin(in);
+	} else {
+	  assert(!in->is_frozen_auth_pin());
+	}
+	mdr->auth_pin(in);
       }
-      mdr->auth_pin(in);
     }
     // xlock(s)?
     if (strong->xlocked_inodes.count(in->vino())) {
@@ -4427,19 +4433,23 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
     }
     // wrlock(s)?
     if (strong->wrlocked_inodes.count(in->vino())) {
-      for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->wrlocked_inodes[in->vino()].begin();
+      for (map<int, list<MMDSCacheRejoin::slave_reqid> >::iterator q = strong->wrlocked_inodes[in->vino()].begin();
 	   q != strong->wrlocked_inodes[in->vino()].end();
 	   ++q) {
 	SimpleLock *lock = in->get_lock(q->first);
-	dout(10) << " inode wrlock by " << q->second << " on " << *lock << " on " << *in << dendl;
-	MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
-	assert(mdr->is_auth_pinned(in));
-	lock->set_state(LOCK_LOCK);
-	if (lock == &in->filelock)
-	  in->loner_cap = -1;
-	lock->get_wrlock(true);
-	mdr->wrlocks.insert(lock);
-	mdr->locks.insert(lock);
+	for (list<MMDSCacheRejoin::slave_reqid>::iterator r = q->second.begin();
+	     r != q->second.end();
+	     ++r) {
+	  dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
+	  MDRequest *mdr = request_get(r->reqid);  // should have this from auth_pin above.
+	  assert(mdr->is_auth_pinned(in));
+	  lock->set_state(LOCK_MIX);
+	  if (lock == &in->filelock)
+	    in->loner_cap = -1;
+	  lock->get_wrlock(true);
+	  mdr->wrlocks.insert(lock);
+	  mdr->locks.insert(lock);
+	}
       }
     }
   }
diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h
index 7c37ab4..f29e8f5 100644
--- a/src/messages/MMDSCacheRejoin.h
+++ b/src/messages/MMDSCacheRejoin.h
@@ -188,11 +188,11 @@ class MMDSCacheRejoin : public Message {
       ::decode(attempt, bl);
     }
   };
-  map<vinodeno_t, slave_reqid> authpinned_inodes;
+  map<vinodeno_t, list<slave_reqid> > authpinned_inodes;
   map<vinodeno_t, slave_reqid> frozen_authpin_inodes;
   map<vinodeno_t, map<__s32, slave_reqid> > xlocked_inodes;
-  map<vinodeno_t, map<__s32, slave_reqid> > wrlocked_inodes;
-  map<dirfrag_t, map<string_snap_t, slave_reqid> > authpinned_dentries;
+  map<vinodeno_t, map<__s32, list<slave_reqid> > > wrlocked_inodes;
+  map<dirfrag_t, map<string_snap_t, list<slave_reqid> > > authpinned_dentries;
   map<dirfrag_t, map<string_snap_t, slave_reqid> > xlocked_dentries;
   
   MMDSCacheRejoin() : Message(MSG_MDS_CACHEREJOIN) {}
@@ -232,7 +232,7 @@ public:
     ::encode(bl, inode_base);
   }
   void add_inode_authpin(vinodeno_t ino, const metareqid_t& ri, __u32 attempt) {
-    authpinned_inodes[ino] = slave_reqid(ri, attempt);
+    authpinned_inodes[ino].push_back(slave_reqid(ri, attempt));
   }
   void add_inode_frozen_authpin(vinodeno_t ino, const metareqid_t& ri, __u32 attempt) {
     frozen_authpin_inodes[ino] = slave_reqid(ri, attempt);
@@ -241,7 +241,7 @@ public:
     xlocked_inodes[ino][lt] = slave_reqid(ri, attempt);
   }
   void add_inode_wrlock(vinodeno_t ino, int lt, const metareqid_t& ri, __u32 attempt) {
-    wrlocked_inodes[ino][lt] = slave_reqid(ri, attempt);
+    wrlocked_inodes[ino][lt].push_back(slave_reqid(ri, attempt));
   }
 
   void add_scatterlock_state(CInode *in) {
@@ -282,7 +282,7 @@ public:
   }
   void add_dentry_authpin(dirfrag_t df, const string& dname, snapid_t last,
 			  const metareqid_t& ri, __u32 attempt) {
-    authpinned_dentries[df][string_snap_t(dname, last)] = slave_reqid(ri, attempt);
+    authpinned_dentries[df][string_snap_t(dname, last)].push_back(slave_reqid(ri, attempt));
   }
   void add_dentry_xlock(dirfrag_t df, const string& dname, snapid_t last,
 			const metareqid_t& ri, __u32 attempt) {
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 24/39] mds: take object's versionlock when rejoinning xlock
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (22 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 23/39] mds: reqid for rejoinning authpin/wrlock need to be list Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21  0:37   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 25/39] mds: share inode max size after MDS recovers Yan, Zheng
                   ` (16 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index f4622de..194f983 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -4327,6 +4327,12 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
 	MDRequest *mdr = request_get(r.reqid);  // should have this from auth_pin above.
 	assert(mdr->is_auth_pinned(dn));
+	if (!mdr->xlocks.count(&dn->versionlock)) {
+	  assert(dn->versionlock.can_xlock_local());
+	  dn->versionlock.get_xlock(mdr, mdr->get_client());
+	  mdr->xlocks.insert(&dn->versionlock);
+	  mdr->locks.insert(&dn->versionlock);
+	}
 	if (dn->lock.is_stable())
 	  dn->auth_pin(&dn->lock);
 	dn->lock.set_state(LOCK_XLOCK);
@@ -4421,6 +4427,12 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
 	MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
 	assert(mdr->is_auth_pinned(in));
+	if (!mdr->xlocks.count(&in->versionlock)) {
+	  assert(in->versionlock.can_xlock_local());
+	  in->versionlock.get_xlock(mdr, mdr->get_client());
+	  mdr->xlocks.insert(&in->versionlock);
+	  mdr->locks.insert(&in->versionlock);
+	}
 	if (lock->is_stable())
 	  in->auth_pin(lock);
 	lock->set_state(LOCK_XLOCK);
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 25/39] mds: share inode max size after MDS recovers
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (23 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 24/39] mds: take object's versionlock when rejoinning xlock Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21  0:45   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 26/39] mds: issue caps when lock state in replica become SYNC Yan, Zheng
                   ` (15 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

The MDS may crash after journaling the new max size, but before sending the
new max size to the client. Later, when the MDS recovers, the client
re-requests the new max size, but the MDS finds the max size unchanged, so
the client waits for the new max size forever. This issue can be avoided by
checking the client cap's last_sent and sharing the inode max size if it is
zero (a reconnected cap's last_sent is zero).
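
The check is effectively a one-shot flag per reconnected cap: a cap rebuilt
from client reconnect has never had a cap message sequenced to the client, so
its last_sent/last_seq is still zero. A tiny standalone sketch of that rule,
with hypothetical names rather than the real Locker/Capability API:

    #include <cstdio>

    struct Cap {
      unsigned last_seq = 0;   // stays 0 until the MDS first sends cap state
      unsigned pending  = 0;   // currently issued cap bits
    };

    const unsigned FILE_WR = 1 << 0, FILE_BUFFER = 1 << 1;

    // Re-share max_size once for caps that came from client reconnect, since
    // the pre-crash MDS may have journaled a larger max_size but died before
    // telling the client about it.
    static bool should_share_max_size(const Cap& cap) {
      return cap.last_seq == 0 && (cap.pending & (FILE_WR | FILE_BUFFER));
    }

    int main() {
      Cap reconnected{0, FILE_WR};
      Cap normal{7, FILE_WR};
      std::printf("%d %d\n", should_share_max_size(reconnected),
                  should_share_max_size(normal));   // prints "1 0"
    }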

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/Locker.cc  | 18 ++++++++++++++----
 src/mds/Locker.h   |  2 +-
 src/mds/MDCache.cc |  2 ++
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 0055a19..4d45f99 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -2089,7 +2089,7 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock,
 }
 
 
-void Locker::share_inode_max_size(CInode *in)
+void Locker::share_inode_max_size(CInode *in, Capability *only_cap)
 {
   /*
    * only share if currently issued a WR cap.  if client doesn't have it,
@@ -2097,9 +2097,12 @@ void Locker::share_inode_max_size(CInode *in)
    * the cap later.
    */
   dout(10) << "share_inode_max_size on " << *in << dendl;
-  for (map<client_t,Capability*>::iterator it = in->client_caps.begin();
-       it != in->client_caps.end();
-       ++it) {
+  map<client_t, Capability*>::iterator it;
+  if (only_cap)
+    it = in->client_caps.find(only_cap->get_client());
+  else
+    it = in->client_caps.begin();
+  for (; it != in->client_caps.end(); ++it) {
     const client_t client = it->first;
     Capability *cap = it->second;
     if (cap->is_suppress())
@@ -2115,6 +2118,8 @@ void Locker::share_inode_max_size(CInode *in)
       in->encode_cap_message(m, cap);
       mds->send_message_client_counted(m, client);
     }
+    if (only_cap)
+      break;
   }
 }
 
@@ -2398,6 +2403,11 @@ void Locker::handle_client_caps(MClientCaps *m)
       bool did_issue = eval(in, CEPH_CAP_LOCKS);
       if (!did_issue && (cap->wanted() & ~cap->pending()))
 	issue_caps(in, cap);
+      if (cap->get_last_seq() == 0 &&
+	  (cap->pending() & (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER))) {
+	cap->issue_norevoke(cap->issued());
+	share_inode_max_size(in, cap);
+      }
     }
   }
 
diff --git a/src/mds/Locker.h b/src/mds/Locker.h
index 3f79996..d98104f 100644
--- a/src/mds/Locker.h
+++ b/src/mds/Locker.h
@@ -276,7 +276,7 @@ public:
   void calc_new_client_ranges(CInode *in, uint64_t size, map<client_t, client_writeable_range_t>& new_ranges);
   bool check_inode_max_size(CInode *in, bool force_wrlock=false, bool update_size=false, uint64_t newsize=0,
 			    utime_t mtime=utime_t());
-  void share_inode_max_size(CInode *in);
+  void share_inode_max_size(CInode *in, Capability *only_cap=0);
 
 private:
   friend class C_MDL_CheckMaxSize;
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 194f983..459b400 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -5073,6 +5073,8 @@ void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap)
   SnapRealm *realm = in->find_snaprealm();
   if (realm->have_past_parents_open()) {
     dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
+    if (cap->get_last_seq() == 0)
+      cap->issue_norevoke(cap->issued()); // reconnected cap
     cap->set_last_issue();
     MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT,
 					in->ino(),
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 26/39] mds: issue caps when lock state in replica become SYNC
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (24 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 25/39] mds: share inode max size after MDS recovers Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21  0:52   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 27/39] mds: send lock action message when auth MDS is in proper state Yan, Zheng
                   ` (14 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

because the client can request READ caps from a non-auth MDS.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/Locker.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 4d45f99..28920d4 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -4403,6 +4403,8 @@ void Locker::handle_file_lock(ScatterLock *lock, MLock *m)
     lock->set_state(LOCK_SYNC);
 
     lock->get_rdlock();
+    if (caps)
+      issue_caps(in);
     lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
     lock->put_rdlock();
     break;
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 27/39] mds: send lock action message when auth MDS is in proper state.
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (25 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 26/39] mds: issue caps when lock state in replica become SYNC Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21  3:12   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 28/39] mds: add dirty imported dirfrag to LogSegment Yan, Zheng
                   ` (13 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

For a rejoining object, don't send the lock ACK message because lock states
are still uncertain. The lock ACK may confuse the object's auth MDS and
trigger an assertion.

If the object's auth MDS is not active, just skip sending the NUDGE, REQRDLOCK
and REQSCATTER messages; MDCache::handle_mds_recovery() will take care
of them.

Also defer the caps release message until clientreplay or active.
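
All three changes apply the same gate: look up the peer's state in the mdsmap
before sending, and either drop or defer the message if the peer cannot act
on it yet. A compact standalone sketch of that gating, with a simplified
state enum instead of the real MDSMap API:

    #include <cstdio>

    enum State { RESOLVE, REJOIN, CLIENTREPLAY, ACTIVE, STOPPING };

    static bool clientreplay_or_active_or_stopping(State s) {
      return s == CLIENTREPLAY || s == ACTIVE || s == STOPPING;
    }

    // REQRDLOCK/REQSCATTER/NUDGE are only useful once the auth MDS can act
    // on them; otherwise stay quiet and let handle_mds_recovery() retry.
    static void maybe_request_scatter(State auth_state) {
      if (!clientreplay_or_active_or_stopping(auth_state)) {
        std::printf("auth not ready, deferring\n");
        return;
      }
      std::printf("sending LOCK_AC_REQSCATTER\n");
    }

    int main() {
      maybe_request_scatter(REJOIN);   // deferred
      maybe_request_scatter(ACTIVE);   // sent
    }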

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/Locker.cc  | 46 ++++++++++++++++++++++++++++++----------------
 src/mds/MDCache.cc | 13 +++++++++++--
 2 files changed, 41 insertions(+), 18 deletions(-)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 28920d4..ece39e3 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -658,6 +658,13 @@ void Locker::eval_gather(SimpleLock *lock, bool first, bool *pneed_issue, list<C
       // replica: tell auth
       int auth = lock->get_parent()->authority().first;
 
+      if (lock->get_parent()->is_rejoining() &&
+	  mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) {
+	dout(7) << "eval_gather finished gather, but still rejoining "
+		<< *lock->get_parent() << dendl;
+	return;
+      }
+
       if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
 	switch (lock->get_state()) {
 	case LOCK_SYNC_LOCK:
@@ -1050,9 +1057,11 @@ bool Locker::_rdlock_kick(SimpleLock *lock, bool as_anon)
     } else {
       // request rdlock state change from auth
       int auth = lock->get_parent()->authority().first;
-      dout(10) << "requesting rdlock from auth on " 
-	       << *lock << " on " << *lock->get_parent() << dendl;
-      mds->send_message_mds(new MLock(lock, LOCK_AC_REQRDLOCK, mds->get_nodeid()), auth);
+      if (mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
+	dout(10) << "requesting rdlock from auth on "
+		 << *lock << " on " << *lock->get_parent() << dendl;
+	mds->send_message_mds(new MLock(lock, LOCK_AC_REQRDLOCK, mds->get_nodeid()), auth);
+      }
       return false;
     }
   }
@@ -1272,9 +1281,11 @@ bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mut, bool nowait)
       // replica.
       // auth should be auth_pinned (see acquire_locks wrlock weird mustpin case).
       int auth = lock->get_parent()->authority().first;
-      dout(10) << "requesting scatter from auth on " 
-	       << *lock << " on " << *lock->get_parent() << dendl;
-      mds->send_message_mds(new MLock(lock, LOCK_AC_REQSCATTER, mds->get_nodeid()), auth);
+      if (mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
+	dout(10) << "requesting scatter from auth on "
+		 << *lock << " on " << *lock->get_parent() << dendl;
+	mds->send_message_mds(new MLock(lock, LOCK_AC_REQSCATTER, mds->get_nodeid()), auth);
+      }
       break;
     }
   }
@@ -1899,13 +1910,19 @@ void Locker::request_inode_file_caps(CInode *in)
     }
 
     int auth = in->authority().first;
+    if (in->is_rejoining() &&
+	mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) {
+      mds->wait_for_active_peer(auth, new C_MDL_RequestInodeFileCaps(this, in));
+      return;
+    }
+
     dout(7) << "request_inode_file_caps " << ccap_string(wanted)
             << " was " << ccap_string(in->replica_caps_wanted) 
             << " on " << *in << " to mds." << auth << dendl;
 
     in->replica_caps_wanted = wanted;
 
-    if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN)
+    if (mds->mdsmap->is_clientreplay_or_active_or_stopping(auth))
       mds->send_message_mds(new MInodeFileCaps(in->ino(), in->replica_caps_wanted),
 			    auth);
   }
@@ -1924,14 +1941,6 @@ void Locker::handle_inode_file_caps(MInodeFileCaps *m)
   assert(in);
   assert(in->is_auth());
 
-  if (mds->is_rejoin() &&
-      in->is_rejoining()) {
-    dout(7) << "handle_inode_file_caps still rejoining " << *in << ", dropping " << *m << dendl;
-    m->put();
-    return;
-  }
-
-  
   dout(7) << "handle_inode_file_caps replica mds." << from << " wants caps " << ccap_string(m->get_caps()) << " on " << *in << dendl;
 
   if (m->get_caps())
@@ -2850,6 +2859,11 @@ void Locker::handle_client_cap_release(MClientCapRelease *m)
   client_t client = m->get_source().num();
   dout(10) << "handle_client_cap_release " << *m << dendl;
 
+  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
+    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
   for (vector<ceph_mds_cap_item>::iterator p = m->caps.begin(); p != m->caps.end(); ++p) {
     inodeno_t ino((uint64_t)p->ino);
     CInode *in = mdcache->get_inode(ino);
@@ -3859,7 +3873,7 @@ void Locker::scatter_nudge(ScatterLock *lock, Context *c, bool forcelockchange)
 	     << *lock << " on " << *p << dendl;
     // request unscatter?
     int auth = lock->get_parent()->authority().first;
-    if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_ACTIVE)
+    if (mds->mdsmap->is_clientreplay_or_active_or_stopping(auth))
       mds->send_message_mds(new MLock(lock, LOCK_AC_NUDGE, mds->get_nodeid()), auth);
 
     // wait...
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 459b400..973a4d0 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -3321,8 +3321,10 @@ void MDCache::recalc_auth_bits()
 
   if (root) {
     root->inode_auth.first = mds->mdsmap->get_root();
-    if (mds->whoami != root->inode_auth.first)
+    if (mds->whoami != root->inode_auth.first) {
       root->state_clear(CInode::STATE_AUTH);
+      root->state_set(CInode::STATE_REJOINING);
+    }
   }
 
   set<CInode*> subtree_inodes;
@@ -3336,8 +3338,10 @@ void MDCache::recalc_auth_bits()
        ++p) {
 
     CInode *inode = p->first->get_inode();
-    if (inode->is_mdsdir() && inode->ino() != MDS_INO_MDSDIR(mds->get_nodeid()))
+    if (inode->is_mdsdir() && inode->ino() != MDS_INO_MDSDIR(mds->get_nodeid())) {
       inode->state_clear(CInode::STATE_AUTH);
+      inode->state_set(CInode::STATE_REJOINING);
+    }
 
     list<CDir*> dfq;  // dirfrag queue
     dfq.push_back(p->first);
@@ -3542,6 +3546,7 @@ void MDCache::rejoin_send_rejoins()
 				    root->filelock.get_state(),
 				    root->nestlock.get_state(),
 				    root->dirfragtreelock.get_state());
+	root->state_set(CInode::STATE_REJOINING);
 	if (root->is_dirty_scattered()) {
 	  dout(10) << " sending scatterlock state on root " << *root << dendl;
 	  p->second->add_scatterlock_state(root);
@@ -3555,6 +3560,7 @@ void MDCache::rejoin_send_rejoins()
 				    in->filelock.get_state(),
 				    in->nestlock.get_state(),
 				    in->dirfragtreelock.get_state());
+	in->state_set(CInode::STATE_REJOINING);
       }
     }
   }  
@@ -3694,6 +3700,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
     // STRONG
     dout(15) << " add_strong_dirfrag " << *dir << dendl;
     rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
+    dir->state_set(CDir::STATE_REJOINING);
 
     for (CDir::map_t::iterator p = dir->items.begin();
 	 p != dir->items.end();
@@ -3707,6 +3714,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
 				dnl->is_remote() ? dnl->get_remote_d_type():0, 
 				dn->get_replica_nonce(),
 				dn->lock.get_state());
+      dn->state_set(CDentry::STATE_REJOINING);
       if (dnl->is_primary()) {
 	CInode *in = dnl->get_inode();
 	dout(15) << " add_strong_inode " << *in << dendl;
@@ -3716,6 +3724,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
 				 in->filelock.get_state(),
 				 in->nestlock.get_state(),
 				 in->dirfragtreelock.get_state());
+	in->state_set(CInode::STATE_REJOINING);
 	in->get_nested_dirfrags(nested);
 	if (in->is_dirty_scattered()) {
 	  dout(10) << " sending scatterlock state on " << *in << dendl;
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 28/39] mds: add dirty imported dirfrag to LogSegment
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (26 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 27/39] mds: send lock action message when auth MDS is in proper state Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21  3:14   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 29/39] mds: avoid double auth pin for file recovery Yan, Zheng
                   ` (12 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/CDir.cc     | 7 +++++--
 src/mds/CDir.h      | 2 +-
 src/mds/Migrator.cc | 2 +-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index af0ae9c..34bd8d3 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -2164,7 +2164,7 @@ void CDir::finish_export(utime_t now)
   dirty_old_rstat.clear();
 }
 
-void CDir::decode_import(bufferlist::iterator& blp, utime_t now)
+void CDir::decode_import(bufferlist::iterator& blp, utime_t now, LogSegment *ls)
 {
   ::decode(first, blp);
   ::decode(fnode, blp);
@@ -2177,7 +2177,10 @@ void CDir::decode_import(bufferlist::iterator& blp, utime_t now)
   ::decode(s, blp);
   state &= MASK_STATE_IMPORT_KEPT;
   state |= (s & MASK_STATE_EXPORTED);
-  if (is_dirty()) get(PIN_DIRTY);
+  if (is_dirty()) {
+    get(PIN_DIRTY);
+    _mark_dirty(ls);
+  }
 
   ::decode(dir_rep, blp);
 
diff --git a/src/mds/CDir.h b/src/mds/CDir.h
index f4a3a3d..7e1db73 100644
--- a/src/mds/CDir.h
+++ b/src/mds/CDir.h
@@ -550,7 +550,7 @@ public:
   void abort_export() { 
     put(PIN_TEMPEXPORTING);
   }
-  void decode_import(bufferlist::iterator& blp, utime_t now);
+  void decode_import(bufferlist::iterator& blp, utime_t now, LogSegment *ls);
 
 
   // -- auth pins --
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index 833df12..d626cb1 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -2397,7 +2397,7 @@ int Migrator::decode_import_dir(bufferlist::iterator& blp,
   dout(7) << "decode_import_dir " << *dir << dendl;
 
   // assimilate state
-  dir->decode_import(blp, now);
+  dir->decode_import(blp, now, ls);
 
   // mark  (may already be marked from get_or_open_dir() above)
   if (!dir->is_auth())
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 29/39] mds: avoid double auth pin for file recovery
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (27 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 28/39] mds: add dirty imported dirfrag to LogSegment Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21  3:20   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 30/39] mds: check MDS peer's state through mdsmap Yan, Zheng
                   ` (11 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 973a4d0..e9a79cd 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -5502,8 +5502,10 @@ void MDCache::_queue_file_recover(CInode *in)
   dout(15) << "_queue_file_recover " << *in << dendl;
   assert(in->is_auth());
   in->state_clear(CInode::STATE_NEEDSRECOVER);
-  in->state_set(CInode::STATE_RECOVERING);
-  in->auth_pin(this);
+  if (!in->state_test(CInode::STATE_RECOVERING)) {
+    in->state_set(CInode::STATE_RECOVERING);
+    in->auth_pin(this);
+  }
   file_recover_queue.insert(in);
 }
 
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 30/39] mds: check MDS peer's state through mdsmap
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (28 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 29/39] mds: avoid double auth pin for file recovery Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21  3:24   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 31/39] mds: unfreeze subtree if import aborts in PREPPED state Yan, Zheng
                   ` (10 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/Migrator.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index d626cb1..143d71e 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -238,7 +238,7 @@ void Migrator::handle_mds_failure_or_stop(int who)
 	export_unlock(dir);
 	export_locks.erase(dir);
 	dir->state_clear(CDir::STATE_EXPORTING);
-	if (export_peer[dir] != who) // tell them.
+	if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them.
 	  mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]);
 	break;
 	
@@ -247,7 +247,7 @@ void Migrator::handle_mds_failure_or_stop(int who)
 	dir->unfreeze_tree();  // cancel the freeze
 	export_state.erase(dir); // clean up
 	dir->state_clear(CDir::STATE_EXPORTING);
-	if (export_peer[dir] != who) // tell them.
+	if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them.
 	  mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]);
 	break;
 
@@ -278,7 +278,7 @@ void Migrator::handle_mds_failure_or_stop(int who)
 	export_unlock(dir);
 	export_locks.erase(dir);
 	dir->state_clear(CDir::STATE_EXPORTING);
-	if (export_peer[dir] != who) // tell them.
+	if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them.
 	  mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]);
 	break;
 	
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 31/39] mds: unfreeze subtree if import aborts in PREPPED state
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (29 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 30/39] mds: check MDS peer's state through mdsmap Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21  3:27   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 32/39] mds: fix export cancel notification Yan, Zheng
                   ` (9 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/Migrator.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index 143d71e..963706c 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -1658,11 +1658,14 @@ void Migrator::handle_export_cancel(MExportDirCancel *m)
     CInode *in = cache->get_inode(df.ino);
     assert(in);
     import_reverse_discovered(df, in);
-  } else if (import_state[df] == IMPORT_PREPPING ||
-	     import_state[df] == IMPORT_PREPPED) {
+  } else if (import_state[df] == IMPORT_PREPPING) {
     CDir *dir = mds->mdcache->get_dirfrag(df);
     assert(dir);
     import_reverse_prepping(dir);
+  } else if (import_state[df] == IMPORT_PREPPED) {
+    CDir *dir = mds->mdcache->get_dirfrag(df);
+    assert(dir);
+    import_reverse_unfreeze(dir);
   } else {
     assert(0 == "got export_cancel in weird state");
   }
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 32/39] mds: fix export cancel notification
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (30 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 31/39] mds: unfreeze subtree if import aborts in PREPPED state Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21  3:31   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 33/39] mds: notify bystanders if export aborts Yan, Zheng
                   ` (8 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

The comment says that if the importer is dead, bystanders think the
exporter is the only auth, as per mdcache->handle_mds_failure(). But
there is no such code in MDCache::handle_mds_failure().

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/Migrator.cc | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index 963706c..40a5394 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -1390,17 +1390,9 @@ void Migrator::export_logged_finish(CDir *dir)
   for (set<int>::iterator p = export_notify_ack_waiting[dir].begin();
        p != export_notify_ack_waiting[dir].end();
        ++p) {
-    MExportDirNotify *notify;
-    if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) 
-      // dest is still alive.
-      notify = new MExportDirNotify(dir->dirfrag(), true,
-				    pair<int,int>(mds->get_nodeid(), dest),
-				    pair<int,int>(dest, CDIR_AUTH_UNKNOWN));
-    else 
-      // dest is dead.  bystanders will think i am only auth, as per mdcache->handle_mds_failure()
-      notify = new MExportDirNotify(dir->dirfrag(), true,
-				    pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN),
-				    pair<int,int>(dest, CDIR_AUTH_UNKNOWN));
+    MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), true,
+						    pair<int,int>(mds->get_nodeid(), dest),
+						    pair<int,int>(dest, CDIR_AUTH_UNKNOWN));
 
     for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); i++)
       notify->get_bounds().push_back((*i)->dirfrag());
@@ -2115,11 +2107,9 @@ void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
   for (set<int>::iterator p = import_bystanders[dir].begin();
        p != import_bystanders[dir].end();
        ++p) {
-    // NOTE: the bystander will think i am _only_ auth, because they will have seen
-    // the exporter's failure and updated the subtree auth.  see mdcache->handle_mds_failure().
-    MExportDirNotify *notify = 
+    MExportDirNotify *notify =
       new MExportDirNotify(dir->dirfrag(), true,
-			   pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN),
+			   pair<int,int>(import_peer[dir->dirfrag()], mds->get_nodeid()),
 			   pair<int,int>(import_peer[dir->dirfrag()], CDIR_AUTH_UNKNOWN));
     for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); i++)
       notify->get_bounds().push_back((*i)->dirfrag());
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 33/39] mds: notify bystanders if export aborts
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (31 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 32/39] mds: fix export cancel notification Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21  3:34   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 34/39] mds: don't open dirfrag while subtree is frozen Yan, Zheng
                   ` (7 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

So bystanders learn earlier that the subtree is single-auth.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/Migrator.cc | 34 ++++++++++++++++++++++++++--------
 src/mds/Migrator.h  |  1 +
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index 40a5394..0672d03 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -251,25 +251,28 @@ void Migrator::handle_mds_failure_or_stop(int who)
 	  mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]);
 	break;
 
-	// NOTE: state order reversal, warning comes after loggingstart+prepping
+	// NOTE: state order reversal, warning comes after prepping
       case EXPORT_WARNING:
 	dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl;
 	// fall-thru
 
       case EXPORT_PREPPING:
 	if (p->second != EXPORT_WARNING) 
-	  dout(10) << "export state=loggingstart|prepping : unpinning bounds, unfreezing" << dendl;
+	  dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl;
 	{
 	  // unpin bounds
 	  set<CDir*> bounds;
 	  cache->get_subtree_bounds(dir, bounds);
-	  for (set<CDir*>::iterator p = bounds.begin();
-	       p != bounds.end();
-	       ++p) {
-	    CDir *bd = *p;
+	  for (set<CDir*>::iterator q = bounds.begin();
+	       q != bounds.end();
+	       ++q) {
+	    CDir *bd = *q;
 	    bd->put(CDir::PIN_EXPORTBOUND);
 	    bd->state_clear(CDir::STATE_EXPORTBOUND);
 	  }
+	  // notify bystanders
+	  if (p->second == EXPORT_WARNING)
+	    export_notify_abort(dir, bounds);
 	}
 	dir->unfreeze_tree();
 	export_state.erase(dir); // clean up
@@ -1307,9 +1310,21 @@ void Migrator::handle_export_ack(MExportDirAck *m)
   m->put();
 }
 
+void Migrator::export_notify_abort(CDir *dir, set<CDir*>& bounds)
+{
+  dout(7) << "export_notify_abort " << *dir << dendl;
 
-
-
+  for (set<int>::iterator p = export_notify_ack_waiting[dir].begin();
+       p != export_notify_ack_waiting[dir].end();
+       ++p) {
+    MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), false,
+						    pair<int,int>(mds->get_nodeid(),export_peer[dir]),
+						    pair<int,int>(mds->get_nodeid(),CDIR_AUTH_UNKNOWN));
+    for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
+      notify->get_bounds().push_back((*i)->dirfrag());
+    mds->send_message_mds(notify, *p);
+  }
+}
 
 /*
  * this happens if hte dest failes after i send teh export data but before it is acked
@@ -1356,6 +1371,9 @@ void Migrator::export_reverse(CDir *dir)
     bd->state_clear(CDir::STATE_EXPORTBOUND);
   }
 
+  // notify bystanders
+  export_notify_abort(dir, bounds);
+
   // process delayed expires
   cache->process_delayed_expire(dir);
   
diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
index 2889a74..f395bc1 100644
--- a/src/mds/Migrator.h
+++ b/src/mds/Migrator.h
@@ -227,6 +227,7 @@ public:
   void export_go(CDir *dir);
   void export_go_synced(CDir *dir);
   void export_reverse(CDir *dir);
+  void export_notify_abort(CDir *dir, set<CDir*>& bounds);
   void handle_export_ack(MExportDirAck *m);
   void export_logged_finish(CDir *dir);
   void handle_export_notify_ack(MExportDirNotifyAck *m);
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 34/39] mds: don't open dirfrag while subtree is frozen
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (32 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 33/39] mds: notify bystanders if export aborts Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21  3:38   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 35/39] mds: clear dirty inode rstat if import fails Yan, Zheng
                   ` (6 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index e9a79cd..30687ec 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -7101,9 +7101,9 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, Context *fin,     // wh
     if (!curdir) {
       if (cur->is_auth()) {
         // parent dir frozen_dir?
-        if (cur->is_frozen_dir()) {
-          dout(7) << "traverse: " << *cur->get_parent_dir() << " is frozen_dir, waiting" << dendl;
-          cur->get_parent_dn()->get_dir()->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
+        if (cur->is_frozen()) {
+          dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
+          cur->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
           return 1;
         }
         curdir = cur->get_or_open_dirfrag(this, fg);
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 35/39] mds: clear dirty inode rstat if import fails
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (33 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 34/39] mds: don't open dirfrag while subtree is frozen Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21  3:40   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 36/39] mds: try merging subtree after clear EXPORTBOUND Yan, Zheng
                   ` (5 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/CDir.cc     | 1 +
 src/mds/Migrator.cc | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 34bd8d3..47b6753 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -1022,6 +1022,7 @@ void CDir::assimilate_dirty_rstat_inodes()
   for (elist<CInode*>::iterator p = dirty_rstat_inodes.begin_use_current();
        !p.end(); ++p) {
     CInode *in = *p;
+    assert(in->is_auth());
     if (in->is_frozen())
       continue;
 
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index 0672d03..f563b8d 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -2052,6 +2052,8 @@ void Migrator::import_reverse(CDir *dir)
 	in->clear_replica_map();
 	if (in->is_dirty()) 
 	  in->mark_clean();
+	in->clear_dirty_rstat();
+
 	in->authlock.clear_gather();
 	in->linklock.clear_gather();
 	in->dirfragtreelock.clear_gather();
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 36/39] mds: try merging subtree after clear EXPORTBOUND
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (34 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 35/39] mds: clear dirty inode rstat if import fails Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21  3:44   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 37/39] mds: eval inodes with caps imported by cache rejoin message Yan, Zheng
                   ` (4 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/Migrator.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index f563b8d..9cbad87 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -1340,10 +1340,6 @@ void Migrator::export_reverse(CDir *dir)
   set<CDir*> bounds;
   cache->get_subtree_bounds(dir, bounds);
 
-  // adjust auth, with possible subtree merge.
-  cache->adjust_subtree_auth(dir, mds->get_nodeid());
-  cache->try_subtree_merge(dir);  // NOTE: may journal subtree_map as side-effect
-
   // remove exporting pins
   list<CDir*> rq;
   rq.push_back(dir);
@@ -1371,6 +1367,10 @@ void Migrator::export_reverse(CDir *dir)
     bd->state_clear(CDir::STATE_EXPORTBOUND);
   }
 
+  // adjust auth, with possible subtree merge.
+  cache->adjust_subtree_auth(dir, mds->get_nodeid());
+  cache->try_subtree_merge(dir);  // NOTE: may journal subtree_map as side-effect
+
   // notify bystanders
   export_notify_abort(dir, bounds);
 
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 37/39] mds: eval inodes with caps imported by cache rejoin message
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (35 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 36/39] mds: try merging subtree after clear EXPORTBOUND Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21  3:45   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 38/39] mds: don't replicate purging dentry Yan, Zheng
                   ` (3 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 30687ec..24f1109 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -3823,6 +3823,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
 	dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
 	rejoin_import_cap(in, q->first, q->second, from);
       }
+      mds->locker->eval(in, CEPH_CAP_LOCKS, true);
     }
   } else {
     assert(mds->is_rejoin());
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 38/39] mds: don't replicate purging dentry
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (36 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 37/39] mds: eval inodes with caps imported by cache rejoin message Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21  3:46   ` Gregory Farnum
  2013-03-17 14:51 ` [PATCH 39/39] mds: clear scatter dirty if replica inode has no auth subtree Yan, Zheng
                   ` (2 subsequent siblings)
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

open_remote_ino is racy: it's possible that someone deletes the inode's
last linkage while the MDS is still discovering the inode.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 24f1109..d730ff1 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -9225,8 +9225,15 @@ void MDCache::handle_discover(MDiscover *dis)
     if (dis->get_want_ino()) {
       // lookup by ino
       CInode *in = get_inode(dis->get_want_ino(), snapid);
-      if (in && in->is_auth() && in->get_parent_dn()->get_dir() == curdir)
+      if (in && in->is_auth() && in->get_parent_dn()->get_dir() == curdir) {
 	dn = in->get_parent_dn();
+	if (dn->state_test(CDentry::STATE_PURGING)) {
+	  // set error flag in reply
+	  dout(7) << "dentry " << *dn << " is purging, flagging error ino" << dendl;
+	  reply->set_flag_error_ino();
+	  break;
+	}
+      }
     } else if (dis->get_want().depth() > 0) {
       // lookup dentry
       dn = curdir->lookup(dis->get_dentry(i), snapid);
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH 39/39] mds: clear scatter dirty if replica inode has no auth subtree
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (37 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 38/39] mds: don't replicate purging dentry Yan, Zheng
@ 2013-03-17 14:51 ` Yan, Zheng
  2013-03-21  3:49   ` Gregory Farnum
  2013-04-01  8:46 ` [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
  2013-04-01  8:51 ` [PATCH] mds: avoid sending duplicated table prepare/commit Yan, Zheng
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-17 14:51 UTC (permalink / raw)
  To: ceph-devel; +Cc: sage, greg, Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

This avoids sending superfluous scatterlock state to a recovering MDS.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/CInode.cc   |  5 +++--
 src/mds/CInode.h    |  2 +-
 src/mds/MDCache.cc  | 13 ++++++-------
 src/mds/Migrator.cc | 15 +++++++++++++++
 4 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 42137f3..25cb6c1 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -615,12 +615,13 @@ void CInode::close_dirfrags()
     close_dirfrag(dirfrags.begin()->first);
 }
 
-bool CInode::has_subtree_root_dirfrag()
+bool CInode::has_subtree_root_dirfrag(int auth)
 {
   for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
        p != dirfrags.end();
        ++p)
-    if (p->second->is_subtree_root())
+    if (p->second->is_subtree_root() &&
+	(auth == -1 || p->second->dir_auth.first == auth))
       return true;
   return false;
 }
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index f7b8f33..bea7430 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -344,7 +344,7 @@ public:
   CDir *add_dirfrag(CDir *dir);
   void close_dirfrag(frag_t fg);
   void close_dirfrags();
-  bool has_subtree_root_dirfrag();
+  bool has_subtree_root_dirfrag(int auth=-1);
 
   void force_dirfrags();
   void verify_dirfrags();
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index d730ff1..75c7ded 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -3330,8 +3330,10 @@ void MDCache::recalc_auth_bits()
   set<CInode*> subtree_inodes;
   for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
        p != subtrees.end();
-       ++p)
-    subtree_inodes.insert(p->first->inode);      
+       ++p) {
+    if (p->first->dir_auth.first == mds->get_nodeid())
+      subtree_inodes.insert(p->first->inode);
+  }
 
   for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
        p != subtrees.end();
@@ -3390,11 +3392,8 @@ void MDCache::recalc_auth_bits()
 	    if (dnl->get_inode()->is_dirty())
 	      dnl->get_inode()->mark_clean();
 	    // avoid touching scatterlocks for our subtree roots!
-	    if (subtree_inodes.count(dnl->get_inode()) == 0) {
-	      dnl->get_inode()->filelock.remove_dirty();
-	      dnl->get_inode()->nestlock.remove_dirty();
-	      dnl->get_inode()->dirfragtreelock.remove_dirty();
-	    }
+	    if (subtree_inodes.count(dnl->get_inode()) == 0)
+	      dnl->get_inode()->clear_scatter_dirty();
 	  }
 
 	  // recurse?
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index 9cbad87..49d21ab 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -1095,6 +1095,10 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, list<Context*>& fini
   
   in->clear_dirty_rstat();
 
+  // no more auth subtree? clear scatter dirty
+  if (!in->has_subtree_root_dirfrag(mds->get_nodeid()))
+    in->clear_scatter_dirty();
+
   in->item_open_file.remove_myself();
 
   // waiters
@@ -1534,6 +1538,11 @@ void Migrator::export_finish(CDir *dir)
   cache->adjust_subtree_auth(dir, export_peer[dir]);
   cache->try_subtree_merge(dir);  // NOTE: may journal subtree_map as sideeffect
 
+  // no more auth subtree? clear scatter dirty
+  if (!dir->get_inode()->is_auth() &&
+      !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid()))
+    dir->get_inode()->clear_scatter_dirty();
+
   // unpin path
   export_unlock(dir);
 
@@ -2020,6 +2029,10 @@ void Migrator::import_reverse(CDir *dir)
     cache->trim_non_auth_subtree(dir);
   cache->adjust_subtree_auth(dir, import_peer[dir->dirfrag()]);
 
+  if (!dir->get_inode()->is_auth() &&
+      !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid()))
+    dir->get_inode()->clear_scatter_dirty();
+
   // adjust auth bits.
   list<CDir*> q;
   q.push_back(dir);
@@ -2053,6 +2066,8 @@ void Migrator::import_reverse(CDir *dir)
 	if (in->is_dirty()) 
 	  in->mark_clean();
 	in->clear_dirty_rstat();
+	if (!in->has_subtree_root_dirfrag(mds->get_nodeid()))
+	  in->clear_scatter_dirty();
 
 	in->authlock.clear_gather();
 	in->linklock.clear_gather();
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* Re: [PATCH 04/39] mds: make sure table request id unique
  2013-03-17 14:51 ` [PATCH 04/39] mds: make sure table request id unique Yan, Zheng
@ 2013-03-19 23:09   ` Greg Farnum
  2013-03-20  5:53     ` Yan, Zheng
  0 siblings, 1 reply; 117+ messages in thread
From: Greg Farnum @ 2013-03-19 23:09 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, sage

Hmm, this is definitely narrowing the race (probably enough to never hit it), but it's not actually eliminating it (if the restart happens after 4 billion requests…). More importantly this kind of symptom makes me worry that we might be papering over more serious issues with colliding states in the Table on restart.
I don't have the MDSTable semantics in my head so I'll need to look into this later unless somebody else volunteers to do so…
-Greg

Software Engineer #42 @ http://inktank.com | http://ceph.com


On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:

> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>  
> When a MDS becomes active, the table server re-sends 'agree' messages
> for old prepared request. If the recoverd MDS starts a new table request
> at the same time, The new request's ID can happen to be the same as old
> prepared request's ID, because current table client assigns request ID
> from zero after MDS restarts.
>  
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)>
> ---
> src/mds/MDS.cc (http://MDS.cc) | 3 +++
> src/mds/MDSTableClient.cc (http://MDSTableClient.cc) | 5 +++++
> src/mds/MDSTableClient.h | 2 ++
> 3 files changed, 10 insertions(+)
>  
> diff --git a/src/mds/MDS.cc (http://MDS.cc) b/src/mds/MDS.cc (http://MDS.cc)
> index bb1c833..859782a 100644
> --- a/src/mds/MDS.cc (http://MDS.cc)
> +++ b/src/mds/MDS.cc (http://MDS.cc)
> @@ -1212,6 +1212,9 @@ void MDS::boot_start(int step, int r)
> dout(2) << "boot_start " << step << ": opening snap table" << dendl;  
> snapserver->load(gather.new_sub());
> }
> +
> + anchorclient->init();
> + snapclient->init();
>  
> dout(2) << "boot_start " << step << ": opening mds log" << dendl;
> mdlog->open(gather.new_sub());
> diff --git a/src/mds/MDSTableClient.cc (http://MDSTableClient.cc) b/src/mds/MDSTableClient.cc (http://MDSTableClient.cc)
> index ea021f5..beba0a3 100644
> --- a/src/mds/MDSTableClient.cc (http://MDSTableClient.cc)
> +++ b/src/mds/MDSTableClient.cc (http://MDSTableClient.cc)
> @@ -34,6 +34,11 @@
> #undef dout_prefix
> #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".tableclient(" << get_mdstable_name(table) << ") "
>  
> +void MDSTableClient::init()
> +{
> + // make reqid unique between MDS restarts
> + last_reqid = (uint64_t)mds->mdsmap->get_epoch() << 32;
> +}
>  
> void MDSTableClient::handle_request(class MMDSTableRequest *m)
> {
> diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
> index e15837f..78035db 100644
> --- a/src/mds/MDSTableClient.h
> +++ b/src/mds/MDSTableClient.h
> @@ -63,6 +63,8 @@ public:
> MDSTableClient(MDS *m, int tab) : mds(m), table(tab), last_reqid(0) {}
> virtual ~MDSTableClient() {}
>  
> + void init();
> +
> void handle_request(MMDSTableRequest *m);
>  
> void _prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl, Context *onfinish);
> --  
> 1.7.11.7



--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 04/39] mds: make sure table request id unique
  2013-03-19 23:09   ` Greg Farnum
@ 2013-03-20  5:53     ` Yan, Zheng
  2013-03-20  6:15       ` Sage Weil
  0 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-20  5:53 UTC (permalink / raw)
  To: Greg Farnum; +Cc: ceph-devel, sage

On 03/20/2013 07:09 AM, Greg Farnum wrote:
> Hmm, this is definitely narrowing the race (probably enough to never hit it), but it's not actually eliminating it (if the restart happens after 4 billion requests…). More importantly this kind of symptom makes me worry that we might be papering over more serious issues with colliding states in the Table on restart.
> I don't have the MDSTable semantics in my head so I'll need to look into this later unless somebody else volunteers to do so…

It's not just 4 billion requests: an MDS restart has several stages, and the mdsmap epoch increases at each stage. I don't think there are any more colliding states in the table. The table client/server use two-phase commit; it's similar to a client request that involves multiple MDSes. The reqid is analogous to the client request ID. The difference is that the client request ID is unique because a new client always gets a unique session ID.
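
To illustrate the idea, here is a rough standalone sketch (not the patch itself; the struct and names are made up):

  // Sketch: namespace reqids by mdsmap epoch.  The epoch strictly increases
  // across MDS restarts, so ids issued after a restart can never collide
  // with ids issued by an earlier incarnation of the table client.
  #include <cstdint>

  struct TableClientIdAlloc {
    uint64_t last_reqid = 0;

    // call once when the MDS (re)starts, before issuing new requests
    void init(uint32_t mdsmap_epoch) {
      last_reqid = (uint64_t)mdsmap_epoch << 32;  // high 32 bits = epoch
    }

    // low 32 bits count requests within this incarnation
    uint64_t next_reqid() { return ++last_reqid; }
  };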

Thanks
Yan, Zheng

> -Greg
> 
> Software Engineer #42 @ http://inktank.com | http://ceph.com
> 
> 
> On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:
> 
>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>  
>> When a MDS becomes active, the table server re-sends 'agree' messages
>> for old prepared request. If the recoverd MDS starts a new table request
>> at the same time, The new request's ID can happen to be the same as old
>> prepared request's ID, because current table client assigns request ID
>> from zero after MDS restarts.
>>  
>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)>
>> ---
>> src/mds/MDS.cc (http://MDS.cc) | 3 +++
>> src/mds/MDSTableClient.cc (http://MDSTableClient.cc) | 5 +++++
>> src/mds/MDSTableClient.h | 2 ++
>> 3 files changed, 10 insertions(+)
>>  
>> diff --git a/src/mds/MDS.cc (http://MDS.cc) b/src/mds/MDS.cc (http://MDS.cc)
>> index bb1c833..859782a 100644
>> --- a/src/mds/MDS.cc (http://MDS.cc)
>> +++ b/src/mds/MDS.cc (http://MDS.cc)
>> @@ -1212,6 +1212,9 @@ void MDS::boot_start(int step, int r)
>> dout(2) << "boot_start " << step << ": opening snap table" << dendl;  
>> snapserver->load(gather.new_sub());
>> }
>> +
>> + anchorclient->init();
>> + snapclient->init();
>>  
>> dout(2) << "boot_start " << step << ": opening mds log" << dendl;
>> mdlog->open(gather.new_sub());
>> diff --git a/src/mds/MDSTableClient.cc (http://MDSTableClient.cc) b/src/mds/MDSTableClient.cc (http://MDSTableClient.cc)
>> index ea021f5..beba0a3 100644
>> --- a/src/mds/MDSTableClient.cc (http://MDSTableClient.cc)
>> +++ b/src/mds/MDSTableClient.cc (http://MDSTableClient.cc)
>> @@ -34,6 +34,11 @@
>> #undef dout_prefix
>> #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".tableclient(" << get_mdstable_name(table) << ") "
>>  
>> +void MDSTableClient::init()
>> +{
>> + // make reqid unique between MDS restarts
>> + last_reqid = (uint64_t)mds->mdsmap->get_epoch() << 32;
>> +}
>>  
>> void MDSTableClient::handle_request(class MMDSTableRequest *m)
>> {
>> diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
>> index e15837f..78035db 100644
>> --- a/src/mds/MDSTableClient.h
>> +++ b/src/mds/MDSTableClient.h
>> @@ -63,6 +63,8 @@ public:
>> MDSTableClient(MDS *m, int tab) : mds(m), table(tab), last_reqid(0) {}
>> virtual ~MDSTableClient() {}
>>  
>> + void init();
>> +
>> void handle_request(MMDSTableRequest *m);
>>  
>> void _prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl, Context *onfinish);
>> --  
>> 1.7.11.7
> 
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 04/39] mds: make sure table request id unique
  2013-03-20  5:53     ` Yan, Zheng
@ 2013-03-20  6:15       ` Sage Weil
  2013-03-20  6:24         ` Yan, Zheng
  2013-03-20  6:49         ` Yan, Zheng
  0 siblings, 2 replies; 117+ messages in thread
From: Sage Weil @ 2013-03-20  6:15 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Greg Farnum, ceph-devel

On Wed, 20 Mar 2013, Yan, Zheng wrote:
> On 03/20/2013 07:09 AM, Greg Farnum wrote:
> > Hmm, this is definitely narrowing the race (probably enough to never hit it), but it's not actually eliminating it (if the restart happens after 4 billion requests?). More importantly this kind of symptom makes me worry that we might be papering over more serious issues with colliding states in the Table on restart.
> > I don't have the MDSTable semantics in my head so I'll need to look into this later unless somebody else volunteers to do so?
> 
> Not just 4 billion requests, MDS restart has several stage, mdsmap epoch 
> increases for each stage. I don't think there are any more colliding 
> states in the table. The table client/server use two phase commit. it's 
> similar to client request that involves multiple MDS. the reqid is 
> analogy to client request id. The difference is client request ID is 
> unique because new client always get an unique session id.

Each time a tid is consumed (at least for an update) it is journaled in 
the EMetaBlob::table_tids list, right?  So we could actually take a max 
from journal replay and pick up where we left off?  That seems like the 
cleanest.
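
Roughly something like this, as an untested sketch (the struct and names are made up):

  // Sketch: during replay, remember the largest table tid seen in the
  // journaled EMetaBlob::table_tids entries, then resume allocating
  // reqids above that once replay finishes.
  #include <algorithm>
  #include <cstdint>
  #include <vector>

  struct ReplayedTableTids {
    uint64_t max_seen = 0;

    // call for each replayed metablob's table_tids
    void note(const std::vector<uint64_t>& tids) {
      for (uint64_t t : tids)
        max_seen = std::max(max_seen, t);
    }
  };

  // after replay: tableclient->set_last_reqid(replayed.max_seen);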

I'm not too worried about 2^32 tids, I guess, but it would be nicer to 
avoid that possibility.

sage

> 
> Thanks
> Yan, Zheng
> 
> > -Greg
> > 
> > Software Engineer #42 @ http://inktank.com | http://ceph.com
> > 
> > 
> > On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:
> > 
> >> From: "Yan, Zheng" <zheng.z.yan@intel.com>
> >>  
> >> When a MDS becomes active, the table server re-sends 'agree' messages
> >> for old prepared request. If the recoverd MDS starts a new table request
> >> at the same time, The new request's ID can happen to be the same as old
> >> prepared request's ID, because current table client assigns request ID
> >> from zero after MDS restarts.
> >>  
> >> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)>
> >> ---
> >> src/mds/MDS.cc (http://MDS.cc) | 3 +++
> >> src/mds/MDSTableClient.cc (http://MDSTableClient.cc) | 5 +++++
> >> src/mds/MDSTableClient.h | 2 ++
> >> 3 files changed, 10 insertions(+)
> >>  
> >> diff --git a/src/mds/MDS.cc (http://MDS.cc) b/src/mds/MDS.cc (http://MDS.cc)
> >> index bb1c833..859782a 100644
> >> --- a/src/mds/MDS.cc (http://MDS.cc)
> >> +++ b/src/mds/MDS.cc (http://MDS.cc)
> >> @@ -1212,6 +1212,9 @@ void MDS::boot_start(int step, int r)
> >> dout(2) << "boot_start " << step << ": opening snap table" << dendl;  
> >> snapserver->load(gather.new_sub());
> >> }
> >> +
> >> + anchorclient->init();
> >> + snapclient->init();
> >>  
> >> dout(2) << "boot_start " << step << ": opening mds log" << dendl;
> >> mdlog->open(gather.new_sub());
> >> diff --git a/src/mds/MDSTableClient.cc (http://MDSTableClient.cc) b/src/mds/MDSTableClient.cc (http://MDSTableClient.cc)
> >> index ea021f5..beba0a3 100644
> >> --- a/src/mds/MDSTableClient.cc (http://MDSTableClient.cc)
> >> +++ b/src/mds/MDSTableClient.cc (http://MDSTableClient.cc)
> >> @@ -34,6 +34,11 @@
> >> #undef dout_prefix
> >> #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".tableclient(" << get_mdstable_name(table) << ") "
> >>  
> >> +void MDSTableClient::init()
> >> +{
> >> + // make reqid unique between MDS restarts
> >> + last_reqid = (uint64_t)mds->mdsmap->get_epoch() << 32;
> >> +}
> >>  
> >> void MDSTableClient::handle_request(class MMDSTableRequest *m)
> >> {
> >> diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
> >> index e15837f..78035db 100644
> >> --- a/src/mds/MDSTableClient.h
> >> +++ b/src/mds/MDSTableClient.h
> >> @@ -63,6 +63,8 @@ public:
> >> MDSTableClient(MDS *m, int tab) : mds(m), table(tab), last_reqid(0) {}
> >> virtual ~MDSTableClient() {}
> >>  
> >> + void init();
> >> +
> >> void handle_request(MMDSTableRequest *m);
> >>  
> >> void _prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl, Context *onfinish);
> >> --  
> >> 1.7.11.7
> > 
> > 
> > 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 04/39] mds: make sure table request id unique
  2013-03-20  6:15       ` Sage Weil
@ 2013-03-20  6:24         ` Yan, Zheng
  2013-03-20  6:49         ` Yan, Zheng
  1 sibling, 0 replies; 117+ messages in thread
From: Yan, Zheng @ 2013-03-20  6:24 UTC (permalink / raw)
  To: Sage Weil; +Cc: Greg Farnum, ceph-devel

On 03/20/2013 02:15 PM, Sage Weil wrote:
> On Wed, 20 Mar 2013, Yan, Zheng wrote:
>> On 03/20/2013 07:09 AM, Greg Farnum wrote:
>>> Hmm, this is definitely narrowing the race (probably enough to never hit it), but it's not actually eliminating it (if the restart happens after 4 billion requests?). More importantly this kind of symptom makes me worry that we might be papering over more serious issues with colliding states in the Table on restart.
>>> I don't have the MDSTable semantics in my head so I'll need to look into this later unless somebody else volunteers to do so?
>>
>> Not just 4 billion requests, MDS restart has several stage, mdsmap epoch 
>> increases for each stage. I don't think there are any more colliding 
>> states in the table. The table client/server use two phase commit. it's 
>> similar to client request that involves multiple MDS. the reqid is 
>> analogy to client request id. The difference is client request ID is 
>> unique because new client always get an unique session id.
> 
> Each time a tid is consumed (at least for an update) it is journaled in 
> the EMetaBlob::table_tids list, right?  So we could actually take a max 
> from journal replay and pick up where we left off?  That seems like the 
> cleanest.

This approach works only if the client journals the reqid before sending the request
to the server, but in the current implementation the client journals the reqid when it
receives the server's agree message.

Regards
Yan, Zheng 
> 
> I'm not too worried about 2^32 tids, I guess, but it would be nicer to 
> avoid that possibility.
> 
> sage
> 
>>
>> Thanks
>> Yan, Zheng
>>
>>> -Greg
>>>
>>> Software Engineer #42 @ http://inktank.com | http://ceph.com
>>>
>>>
>>> On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:
>>>
>>>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>>>  
>>>> When a MDS becomes active, the table server re-sends 'agree' messages
>>>> for old prepared request. If the recoverd MDS starts a new table request
>>>> at the same time, The new request's ID can happen to be the same as old
>>>> prepared request's ID, because current table client assigns request ID
>>>> from zero after MDS restarts.
>>>>  
>>>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)>
>>>> ---
>>>> src/mds/MDS.cc (http://MDS.cc) | 3 +++
>>>> src/mds/MDSTableClient.cc (http://MDSTableClient.cc) | 5 +++++
>>>> src/mds/MDSTableClient.h | 2 ++
>>>> 3 files changed, 10 insertions(+)
>>>>  
>>>> diff --git a/src/mds/MDS.cc (http://MDS.cc) b/src/mds/MDS.cc (http://MDS.cc)
>>>> index bb1c833..859782a 100644
>>>> --- a/src/mds/MDS.cc (http://MDS.cc)
>>>> +++ b/src/mds/MDS.cc (http://MDS.cc)
>>>> @@ -1212,6 +1212,9 @@ void MDS::boot_start(int step, int r)
>>>> dout(2) << "boot_start " << step << ": opening snap table" << dendl;  
>>>> snapserver->load(gather.new_sub());
>>>> }
>>>> +
>>>> + anchorclient->init();
>>>> + snapclient->init();
>>>>  
>>>> dout(2) << "boot_start " << step << ": opening mds log" << dendl;
>>>> mdlog->open(gather.new_sub());
>>>> diff --git a/src/mds/MDSTableClient.cc (http://MDSTableClient.cc) b/src/mds/MDSTableClient.cc (http://MDSTableClient.cc)
>>>> index ea021f5..beba0a3 100644
>>>> --- a/src/mds/MDSTableClient.cc (http://MDSTableClient.cc)
>>>> +++ b/src/mds/MDSTableClient.cc (http://MDSTableClient.cc)
>>>> @@ -34,6 +34,11 @@
>>>> #undef dout_prefix
>>>> #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".tableclient(" << get_mdstable_name(table) << ") "
>>>>  
>>>> +void MDSTableClient::init()
>>>> +{
>>>> + // make reqid unique between MDS restarts
>>>> + last_reqid = (uint64_t)mds->mdsmap->get_epoch() << 32;
>>>> +}
>>>>  
>>>> void MDSTableClient::handle_request(class MMDSTableRequest *m)
>>>> {
>>>> diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
>>>> index e15837f..78035db 100644
>>>> --- a/src/mds/MDSTableClient.h
>>>> +++ b/src/mds/MDSTableClient.h
>>>> @@ -63,6 +63,8 @@ public:
>>>> MDSTableClient(MDS *m, int tab) : mds(m), table(tab), last_reqid(0) {}
>>>> virtual ~MDSTableClient() {}
>>>>  
>>>> + void init();
>>>> +
>>>> void handle_request(MMDSTableRequest *m);
>>>>  
>>>> void _prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl, Context *onfinish);
>>>> --  
>>>> 1.7.11.7
>>>
>>>
>>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>>


^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 04/39] mds: make sure table request id unique
  2013-03-20  6:15       ` Sage Weil
  2013-03-20  6:24         ` Yan, Zheng
@ 2013-03-20  6:49         ` Yan, Zheng
  2013-03-20 18:31           ` Greg Farnum
  1 sibling, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-20  6:49 UTC (permalink / raw)
  To: Sage Weil; +Cc: Greg Farnum, ceph-devel

On 03/20/2013 02:15 PM, Sage Weil wrote:
> On Wed, 20 Mar 2013, Yan, Zheng wrote:
>> On 03/20/2013 07:09 AM, Greg Farnum wrote:
>>> Hmm, this is definitely narrowing the race (probably enough to never hit it), but it's not actually eliminating it (if the restart happens after 4 billion requests?). More importantly this kind of symptom makes me worry that we might be papering over more serious issues with colliding states in the Table on restart.
>>> I don't have the MDSTable semantics in my head so I'll need to look into this later unless somebody else volunteers to do so?
>>
>> Not just 4 billion requests, MDS restart has several stage, mdsmap epoch 
>> increases for each stage. I don't think there are any more colliding 
>> states in the table. The table client/server use two phase commit. it's 
>> similar to client request that involves multiple MDS. the reqid is 
>> analogy to client request id. The difference is client request ID is 
>> unique because new client always get an unique session id.
> 
> Each time a tid is consumed (at least for an update) it is journaled in 
> the EMetaBlob::table_tids list, right?  So we could actually take a max 
> from journal replay and pick up where we left off?  That seems like the 
> cleanest.
> 
> I'm not too worried about 2^32 tids, I guess, but it would be nicer to 
> avoid that possibility.
> 

Can we re-use the client request ID as the table client request ID?

Regards
Yan, Zheng

> sage
> 
>>
>> Thanks
>> Yan, Zheng
>>
>>> -Greg
>>>
>>> Software Engineer #42 @ http://inktank.com | http://ceph.com
>>>
>>>
>>> On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:
>>>
>>>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>>>  
>>>> When a MDS becomes active, the table server re-sends 'agree' messages
>>>> for old prepared request. If the recoverd MDS starts a new table request
>>>> at the same time, The new request's ID can happen to be the same as old
>>>> prepared request's ID, because current table client assigns request ID
>>>> from zero after MDS restarts.
>>>>  
>>>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)>
>>>> ---
>>>> src/mds/MDS.cc (http://MDS.cc) | 3 +++
>>>> src/mds/MDSTableClient.cc (http://MDSTableClient.cc) | 5 +++++
>>>> src/mds/MDSTableClient.h | 2 ++
>>>> 3 files changed, 10 insertions(+)
>>>>  
>>>> diff --git a/src/mds/MDS.cc (http://MDS.cc) b/src/mds/MDS.cc (http://MDS.cc)
>>>> index bb1c833..859782a 100644
>>>> --- a/src/mds/MDS.cc (http://MDS.cc)
>>>> +++ b/src/mds/MDS.cc (http://MDS.cc)
>>>> @@ -1212,6 +1212,9 @@ void MDS::boot_start(int step, int r)
>>>> dout(2) << "boot_start " << step << ": opening snap table" << dendl;  
>>>> snapserver->load(gather.new_sub());
>>>> }
>>>> +
>>>> + anchorclient->init();
>>>> + snapclient->init();
>>>>  
>>>> dout(2) << "boot_start " << step << ": opening mds log" << dendl;
>>>> mdlog->open(gather.new_sub());
>>>> diff --git a/src/mds/MDSTableClient.cc (http://MDSTableClient.cc) b/src/mds/MDSTableClient.cc (http://MDSTableClient.cc)
>>>> index ea021f5..beba0a3 100644
>>>> --- a/src/mds/MDSTableClient.cc (http://MDSTableClient.cc)
>>>> +++ b/src/mds/MDSTableClient.cc (http://MDSTableClient.cc)
>>>> @@ -34,6 +34,11 @@
>>>> #undef dout_prefix
>>>> #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".tableclient(" << get_mdstable_name(table) << ") "
>>>>  
>>>> +void MDSTableClient::init()
>>>> +{
>>>> + // make reqid unique between MDS restarts
>>>> + last_reqid = (uint64_t)mds->mdsmap->get_epoch() << 32;
>>>> +}
>>>>  
>>>> void MDSTableClient::handle_request(class MMDSTableRequest *m)
>>>> {
>>>> diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
>>>> index e15837f..78035db 100644
>>>> --- a/src/mds/MDSTableClient.h
>>>> +++ b/src/mds/MDSTableClient.h
>>>> @@ -63,6 +63,8 @@ public:
>>>> MDSTableClient(MDS *m, int tab) : mds(m), table(tab), last_reqid(0) {}
>>>> virtual ~MDSTableClient() {}
>>>>  
>>>> + void init();
>>>> +
>>>> void handle_request(MMDSTableRequest *m);
>>>>  
>>>> void _prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl, Context *onfinish);
>>>> --  
>>>> 1.7.11.7
>>>
>>>
>>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>>


^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 04/39] mds: make sure table request id unique
  2013-03-20  6:49         ` Yan, Zheng
@ 2013-03-20 18:31           ` Greg Farnum
  2013-03-21  8:07             ` Yan, Zheng
  0 siblings, 1 reply; 117+ messages in thread
From: Greg Farnum @ 2013-03-20 18:31 UTC (permalink / raw)
  To: Yan, Zheng, Sage Weil; +Cc: ceph-devel

On Tuesday, March 19, 2013 at 11:49 PM, Yan, Zheng wrote:
> On 03/20/2013 02:15 PM, Sage Weil wrote:
> > On Wed, 20 Mar 2013, Yan, Zheng wrote:
> > > On 03/20/2013 07:09 AM, Greg Farnum wrote:
> > > > Hmm, this is definitely narrowing the race (probably enough to never hit it), but it's not actually eliminating it (if the restart happens after 4 billion requests?). More importantly this kind of symptom makes me worry that we might be papering over more serious issues with colliding states in the Table on restart.
> > > > I don't have the MDSTable semantics in my head so I'll need to look into this later unless somebody else volunteers to do so?
> > >  
> > >  
> > >  
> > > Not just 4 billion requests, MDS restart has several stage, mdsmap epoch  
> > > increases for each stage. I don't think there are any more colliding  
> > > states in the table. The table client/server use two phase commit. it's  
> > > similar to client request that involves multiple MDS. the reqid is  
> > > analogy to client request id. The difference is client request ID is  
> > > unique because new client always get an unique session id.
> >  
> >  
> >  
> > Each time a tid is consumed (at least for an update) it is journaled in  
> > the EMetaBlob::table_tids list, right? So we could actually take a max  
> > from journal replay and pick up where we left off? That seems like the  
> > cleanest.
> >  
> > I'm not too worried about 2^32 tids, I guess, but it would be nicer to  
> > avoid that possibility.
>  
>  
>  
> Can we re-use the client request ID as table client request ID ?
>  
> Regards
> Yan, Zheng

Not sure what you're referring to here — do you mean the ID of the filesystem client request which prompted the update? I don't think that would work, as client requests actually require two parts to be unique (the client GUID and the request seq number), and I'm pretty sure a single client request can spawn multiple Table updates.
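
For comparison, a client request id is effectively a pair, roughly like this (illustrative sketch only, not the real type):

  // Illustrative only: a filesystem client request is identified by the
  // client's identity plus a per-client sequence number, so requests from
  // different client incarnations cannot collide.
  #include <cstdint>
  #include <tuple>

  struct ClientReqId {
    uint64_t client;  // unique per client session
    uint64_t seq;     // monotonically increasing within that session
    bool operator<(const ClientReqId& o) const {
      return std::tie(client, seq) < std::tie(o.client, o.seq);
    }
  };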

As I look over this more, it sure looks to me as if the effect of the code we have (when non-broken) is to roll back every non-committed request by an MDS which restarted — the only time it can handle the TableServer's "agree" with a different response is if the MDS was incorrectly marked out by the map. Am I parsing this correctly, Sage? Given that, and without having looked at the code more broadly, I think we want to add some sort of implicit or explicit handshake letting each of them know if the MDS actually disappeared. We use the process/address nonce to accomplish this in other places…
-Greg

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 01/39] mds: preserve subtree bounds until slave commit
  2013-03-17 14:51 ` [PATCH 01/39] mds: preserve subtree bounds until slave commit Yan, Zheng
@ 2013-03-20 18:33   ` Greg Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Greg Farnum @ 2013-03-20 18:33 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, sage

Reviewed-by: Greg Farnum <greg@inktank.com> 

Software Engineer #42 @ http://inktank.com | http://ceph.com


On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:

> From: "Yan, Zheng" <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)>
> 
> When replaying an operation that rename a directory inode to non-auth subtree,
> if the inode has subtree bounds, we should prevent them from being trimmed
> until slave commit.
> 
> This patch also fixes a bug in ESlaveUpdate::replay(). EMetaBlob::replay()
> should be called before MDCache::finish_uncommitted_slave_update().
> 
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)>
> ---
> src/mds/MDCache.cc (http://MDCache.cc) | 21 +++++++++++----------
> src/mds/Mutation.h | 5 ++---
> src/mds/journal.cc (http://journal.cc) | 13 +++++++++----
> 3 files changed, 22 insertions(+), 17 deletions(-)
> 
> diff --git a/src/mds/MDCache.cc (http://MDCache.cc) b/src/mds/MDCache.cc (http://MDCache.cc)
> index fddcfc6..684e70b 100644
> --- a/src/mds/MDCache.cc (http://MDCache.cc)
> +++ b/src/mds/MDCache.cc (http://MDCache.cc)
> @@ -3016,10 +3016,10 @@ void MDCache::add_uncommitted_slave_update(metareqid_t reqid, int master, MDSlav
> {
> assert(uncommitted_slave_updates[master].count(reqid) == 0);
> uncommitted_slave_updates[master][reqid] = su;
> - if (su->rename_olddir)
> - uncommitted_slave_rename_olddir[su->rename_olddir]++;
> + for(set<CDir*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
> + uncommitted_slave_rename_olddir[*p]++;
> for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
> - uncommitted_slave_unlink[*p]++;
> + uncommitted_slave_unlink[*p]++;
> }
> 
> void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, int master)
> @@ -3031,11 +3031,12 @@ void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, int master)
> if (uncommitted_slave_updates[master].empty())
> uncommitted_slave_updates.erase(master);
> // discard the non-auth subtree we renamed out of
> - if (su->rename_olddir) {
> - uncommitted_slave_rename_olddir[su->rename_olddir]--;
> - if (uncommitted_slave_rename_olddir[su->rename_olddir] == 0) {
> - uncommitted_slave_rename_olddir.erase(su->rename_olddir);
> - CDir *root = get_subtree_root(su->rename_olddir);
> + for(set<CDir*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
> + CDir *dir = *p;
> + uncommitted_slave_rename_olddir[dir]--;
> + if (uncommitted_slave_rename_olddir[dir] == 0) {
> + uncommitted_slave_rename_olddir.erase(dir);
> + CDir *root = get_subtree_root(dir);
> if (root->get_dir_auth() == CDIR_AUTH_UNDEF)
> try_trim_non_auth_subtree(root);
> }
> @@ -6052,8 +6053,8 @@ bool MDCache::trim_non_auth_subtree(CDir *dir)
> {
> dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
> 
> - // preserve the dir for rollback
> - if (uncommitted_slave_rename_olddir.count(dir))
> + if (uncommitted_slave_rename_olddir.count(dir) || // preserve the dir for rollback
> + my_ambiguous_imports.count(dir->dirfrag()))
> return true;
> 
> bool keep_dir = false;
> diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h
> index 55b84eb..5013f04 100644
> --- a/src/mds/Mutation.h
> +++ b/src/mds/Mutation.h
> @@ -315,13 +315,12 @@ struct MDSlaveUpdate {
> bufferlist rollback;
> elist<MDSlaveUpdate*>::item item;
> Context *waiter;
> - CDir* rename_olddir;
> + set<CDir*> olddirs;
> set<CInode*> unlinked;
> MDSlaveUpdate(int oo, bufferlist &rbl, elist<MDSlaveUpdate*> &list) :
> origop(oo),
> item(this),
> - waiter(0),
> - rename_olddir(0) {
> + waiter(0) {
> rollback.claim(rbl);
> list.push_back(&item);
> }
> diff --git a/src/mds/journal.cc b/src/mds/journal.cc
> index 5b3bd71..3375e40 100644
> --- a/src/mds/journal.cc
> +++ b/src/mds/journal.cc
> @@ -1131,10 +1131,15 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
> if (olddir) {
> if (olddir->authority() != CDIR_AUTH_UNDEF &&
> renamed_diri->authority() == CDIR_AUTH_UNDEF) {
> + assert(slaveup); // auth to non-auth, must be slave prepare
> list<frag_t> leaves;
> renamed_diri->dirfragtree.get_leaves(leaves);
> - for (list<frag_t>::iterator p = leaves.begin(); p != leaves.end(); ++p)
> - renamed_diri->get_or_open_dirfrag(mds->mdcache, *p);
> + for (list<frag_t>::iterator p = leaves.begin(); p != leaves.end(); ++p) {
> + CDir *dir = renamed_diri->get_or_open_dirfrag(mds->mdcache, *p);
> + // preserve subtree bound until slave commit
> + if (dir->authority() == CDIR_AUTH_UNDEF)
> + slaveup->olddirs.insert(dir);
> + }
> }
> 
> mds->mdcache->adjust_subtree_after_rename(renamed_diri, olddir, false);
> @@ -1143,7 +1148,7 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
> CDir *root = mds->mdcache->get_subtree_root(olddir);
> if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
> if (slaveup) // preserve the old dir until slave commit
> - slaveup->rename_olddir = olddir;
> + slaveup->olddirs.insert(olddir);
> else
> mds->mdcache->try_trim_non_auth_subtree(root);
> }
> @@ -2122,10 +2127,10 @@ void ESlaveUpdate::replay(MDS *mds)
> case ESlaveUpdate::OP_ROLLBACK:
> dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master
> << ": applying rollback commit blob" << dendl;
> + commit.replay(mds, _segment);
> su = mds->mdcache->get_uncommitted_slave_update(reqid, master);
> if (su)
> mds->mdcache->finish_uncommitted_slave_update(reqid, master);
> - commit.replay(mds, _segment);
> break;
> 
> default:
> -- 
> 1.7.11.7
> 
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org (mailto:majordomo@vger.kernel.org)
> More majordomo info at http://vger.kernel.org/majordomo-info.html
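
As an aside, here is a minimal standalone sketch of the refcount-map idiom the patch moves to: each slave update pins the set of old dirs it renamed out of, and a dir only becomes trimmable once its count drops back to zero. Dir, add_slave_update and finish_slave_update below are simplified stand-ins, not the real MDCache types or interfaces.

  #include <map>
  #include <set>

  struct Dir {};                                   // stand-in for CDir

  std::map<Dir*, int> pinned_olddirs;              // stand-in for uncommitted_slave_rename_olddir

  // record a slave update: pin every old dir it renamed out of
  void add_slave_update(const std::set<Dir*>& olddirs) {
    for (Dir* d : olddirs)
      ++pinned_olddirs[d];
  }

  // slave commit/rollback: unpin; return the dirs that may now be trimmed
  std::set<Dir*> finish_slave_update(const std::set<Dir*>& olddirs) {
    std::set<Dir*> trimmable;
    for (Dir* d : olddirs) {
      if (--pinned_olddirs[d] == 0) {
        pinned_olddirs.erase(d);
        trimmable.insert(d);                       // caller would try_trim_non_auth_subtree() here
      }
    }
    return trimmable;
  }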




^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 02/39] mds: process finished contexts in batch
  2013-03-17 14:51 ` [PATCH 02/39] mds: process finished contexts in batch Yan, Zheng
@ 2013-03-20 18:33   ` Greg Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Greg Farnum @ 2013-03-20 18:33 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, sage

Reviewed-by: Greg Farnum <greg@inktank.com>


Software Engineer #42 @ http://inktank.com | http://ceph.com


On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:

> From: "Yan, Zheng" <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)>
> 
> If there are several unstable locks in an inode, current Locker::eval(CInode*,)
> processes each lock's finished contexts seperately. This may cause very deep
> call stack if finished contexts also call Locker::eval() on the same inode.
> An extreme example is:
> 
> Locker::eval() wakes an open request(). Server::handle_client_open() starts
> a log entry, then call Locker::issue_new_caps(). Locker::issue_new_caps()
> calls Locker::eval() and wakes another request. The later request also tries
> starting a log entry.
> 
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
> src/mds/Locker.cc | 17 ++++++++++-------
> src/mds/Locker.h | 4 ++--
> 2 files changed, 12 insertions(+), 9 deletions(-)
> 
> diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
> index b61fb14..d06a9cc 100644
> --- a/src/mds/Locker.cc
> +++ b/src/mds/Locker.cc
> @@ -803,6 +803,7 @@ void Locker::eval_gather(SimpleLock *lock, bool first, bool *pneed_issue, list<C
> bool Locker::eval(CInode *in, int mask, bool caps_imported)
> {
> bool need_issue = false;
> + list<Context*> finishers;
> 
> dout(10) << "eval " << mask << " " << *in << dendl;
> 
> @@ -821,19 +822,19 @@ bool Locker::eval(CInode *in, int mask, bool caps_imported)
> 
> retry:
> if (mask & CEPH_LOCK_IFILE)
> - eval_any(&in->filelock, &need_issue, caps_imported);
> + eval_any(&in->filelock, &need_issue, &finishers, caps_imported);
> if (mask & CEPH_LOCK_IAUTH)
> - eval_any(&in->authlock, &need_issue, caps_imported);
> + eval_any(&in->authlock, &need_issue, &finishers, caps_imported);
> if (mask & CEPH_LOCK_ILINK)
> - eval_any(&in->linklock, &need_issue,caps_imported);
> + eval_any(&in->linklock, &need_issue, &finishers, caps_imported);
> if (mask & CEPH_LOCK_IXATTR)
> - eval_any(&in->xattrlock, &need_issue, caps_imported);
> + eval_any(&in->xattrlock, &need_issue, &finishers, caps_imported);
> if (mask & CEPH_LOCK_INEST)
> - eval_any(&in->nestlock, &need_issue, caps_imported);
> + eval_any(&in->nestlock, &need_issue, &finishers, caps_imported);
> if (mask & CEPH_LOCK_IFLOCK)
> - eval_any(&in->flocklock, &need_issue, caps_imported);
> + eval_any(&in->flocklock, &need_issue, &finishers, caps_imported);
> if (mask & CEPH_LOCK_IPOLICY)
> - eval_any(&in->policylock, &need_issue, caps_imported);
> + eval_any(&in->policylock, &need_issue, &finishers, caps_imported);
> 
> // drop loner?
> if (in->is_auth() && in->is_head() && in->get_wanted_loner() != in->get_loner()) {
> @@ -854,6 +855,8 @@ bool Locker::eval(CInode *in, int mask, bool caps_imported)
> }
> }
> 
> + finish_contexts(g_ceph_context, finishers);
> +
> if (need_issue && in->is_head())
> issue_caps(in);
> 
> diff --git a/src/mds/Locker.h b/src/mds/Locker.h
> index f005925..3f79996 100644
> --- a/src/mds/Locker.h
> +++ b/src/mds/Locker.h
> @@ -99,9 +99,9 @@ public:
> 
> void eval_gather(SimpleLock *lock, bool first=false, bool *need_issue=0, list<Context*> *pfinishers=0);
> void eval(SimpleLock *lock, bool *need_issue);
> - void eval_any(SimpleLock *lock, bool *need_issue, bool first=false) {
> + void eval_any(SimpleLock *lock, bool *need_issue, list<Context*> *pfinishers=0, bool first=false) {
> if (!lock->is_stable())
> - eval_gather(lock, first, need_issue);
> + eval_gather(lock, first, need_issue, pfinishers);
> else if (lock->get_parent()->is_auth())
> eval(lock, need_issue);
> }
> -- 
> 1.7.11.7
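
For reference, a small self-contained sketch of the gather-then-finish pattern the patch applies: waiters woken while evaluating each lock are queued in one list and run after the loop, so a waiter that re-enters the evaluation path no longer deepens the call stack. Context and eval_one_lock below are simplified stand-ins rather than the actual Locker code.

  #include <functional>
  #include <list>

  using Context = std::function<void()>;          // stand-in for the MDS Context class

  // evaluate one lock; any waiters it wakes are queued instead of run inline
  void eval_one_lock(int /*lock_id*/, std::list<Context>* finishers) {
    // ... lock state transitions would happen here ...
    finishers->push_back([] { /* retry the request blocked on this lock */ });
  }

  void eval_inode_locks() {
    std::list<Context> finishers;
    for (int lock_id = 0; lock_id < 7; ++lock_id) // filelock, authlock, linklock, ...
      eval_one_lock(lock_id, &finishers);
    for (auto& fin : finishers)                   // one pass over the woken waiters, after the loop
      fin();
  }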




^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 03/39] mds: fix MDCache::adjust_bounded_subtree_auth()
  2013-03-17 14:51 ` [PATCH 03/39] mds: fix MDCache::adjust_bounded_subtree_auth() Yan, Zheng
@ 2013-03-20 18:33   ` Greg Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Greg Farnum @ 2013-03-20 18:33 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, sage

Reviewed-by: Greg Farnum <greg@inktank.com>


Software Engineer #42 @ http://inktank.com | http://ceph.com


On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:

> From: "Yan, Zheng" <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)>
> 
> There are cases that need both create new bound and swallow intervening
> subtree. For example: A MDS exports subtree A with bound B and imports
> subtree B with bound C at the same time. The MDS crashes, exporting
> subtree A fails, but importing subtree B succeed. During recovery, the
> MDS may create new bound C and swallow subtree B.
> 
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
> src/mds/MDCache.cc | 10 ++++++++--
> 1 file changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 684e70b..19dc60b 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -980,15 +980,21 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, pair<in
> }
> else {
> dout(10) << " want bound " << *bound << dendl;
> + CDir *t = get_subtree_root(bound->get_parent_dir());
> + if (subtrees[t].count(bound) == 0) {
> + assert(t != dir);
> + dout(10) << " new bound " << *bound << dendl;
> + adjust_subtree_auth(bound, t->authority());
> + }
> // make sure it's nested beneath ambiguous subtree(s)
> while (1) {
> - CDir *t = get_subtree_root(bound->get_parent_dir());
> - if (t == dir) break;
> while (subtrees[dir].count(t) == 0)
> t = get_subtree_root(t->get_parent_dir());
> dout(10) << " swallowing intervening subtree at " << *t << dendl;
> adjust_subtree_auth(t, auth);
> try_subtree_merge_at(t);
> + t = get_subtree_root(bound->get_parent_dir());
> + if (t == dir) break;
> }
> }
> }
> -- 
> 1.7.11.7




^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 05/39] mds: send table request when peer is in proper state.
  2013-03-17 14:51 ` [PATCH 05/39] mds: send table request when peer is in proper state Yan, Zheng
@ 2013-03-20 18:34   ` Greg Farnum
  2013-03-29 21:58   ` Gregory Farnum
  1 sibling, 0 replies; 117+ messages in thread
From: Greg Farnum @ 2013-03-20 18:34 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, sage

This and patch 6 are probably going to get dealt with as part of our conversation on patch 4 and restart of the TableServers. 

Software Engineer #42 @ http://inktank.com | http://ceph.com


On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:

> From: "Yan, Zheng" <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)>
> 
> Table client/server should send request/reply when the peer is active.
> Anchor query is an exception, because MDS in rejoin stage may need
> fetch files before sending rejoin ack, the anchor server can also be
> in rejoin stage.
> 
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
> src/mds/AnchorClient.cc | 5 ++++-
> src/mds/MDSTableClient.cc | 9 ++++++---
> src/mds/MDSTableServer.cc | 3 ++-
> 3 files changed, 12 insertions(+), 5 deletions(-)
> 
> diff --git a/src/mds/AnchorClient.cc b/src/mds/AnchorClient.cc
> index 455e97f..d7da9d1 100644
> --- a/src/mds/AnchorClient.cc
> +++ b/src/mds/AnchorClient.cc
> @@ -80,9 +80,12 @@ void AnchorClient::lookup(inodeno_t ino, vector<Anchor>& trace, Context *onfinis
> 
> void AnchorClient::_lookup(inodeno_t ino)
> {
> + int ts = mds->mdsmap->get_tableserver();
> + if (mds->mdsmap->get_state(ts) < MDSMap::STATE_REJOIN)
> + return;
> MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_QUERY, 0, 0);
> ::encode(ino, req->bl);
> - mds->send_message_mds(req, mds->mdsmap->get_tableserver());
> + mds->send_message_mds(req, ts);
> }
> 
> 
> diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc
> index beba0a3..df0131f 100644
> --- a/src/mds/MDSTableClient.cc
> +++ b/src/mds/MDSTableClient.cc
> @@ -149,9 +149,10 @@ void MDSTableClient::_prepare(bufferlist& mutation, version_t *ptid, bufferlist
> void MDSTableClient::send_to_tableserver(MMDSTableRequest *req)
> {
> int ts = mds->mdsmap->get_tableserver();
> - if (mds->mdsmap->get_state(ts) >= MDSMap::STATE_CLIENTREPLAY)
> + if (mds->mdsmap->get_state(ts) >= MDSMap::STATE_CLIENTREPLAY) {
> mds->send_message_mds(req, ts);
> - else {
> + } else {
> + req->put();
> dout(10) << " deferring request to not-yet-active tableserver mds." << ts << dendl;
> }
> }
> @@ -193,7 +194,9 @@ void MDSTableClient::got_journaled_ack(version_t tid)
> void MDSTableClient::finish_recovery()
> {
> dout(7) << "finish_recovery" << dendl;
> - resend_commits();
> + int ts = mds->mdsmap->get_tableserver();
> + if (mds->mdsmap->get_state(ts) >= MDSMap::STATE_CLIENTREPLAY)
> + resend_commits();
> }
> 
> void MDSTableClient::resend_commits()
> diff --git a/src/mds/MDSTableServer.cc b/src/mds/MDSTableServer.cc
> index 4f86ff1..07c7d26 100644
> --- a/src/mds/MDSTableServer.cc
> +++ b/src/mds/MDSTableServer.cc
> @@ -159,7 +159,8 @@ void MDSTableServer::handle_mds_recovery(int who)
> for (map<version_t,mds_table_pending_t>::iterator p = pending_for_mds.begin();
> p != pending_for_mds.end();
> ++p) {
> - if (who >= 0 && p->second.mds != who)
> + if ((who >= 0 && p->second.mds != who) ||
> + mds->mdsmap->get_state(p->second.mds) < MDSMap::STATE_CLIENTREPLAY)
> continue;
> MMDSTableRequest *reply = new MMDSTableRequest(table, TABLESERVER_OP_AGREE, p->second.reqid, p->second.tid);
> mds->send_message_mds(reply, p->second.mds);
> -- 
> 1.7.11.7
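
A tiny sketch of the guard the patch introduces around table traffic; PeerState is a hypothetical enum standing in for the MDSMap state values the real code compares against (STATE_CLIENTREPLAY, STATE_REJOIN).

  enum class PeerState { DOWN, REPLAY, RESOLVE, REJOIN, CLIENTREPLAY, ACTIVE };  // hypothetical ordering

  // table prepare/commit traffic only goes to a server that is at least in clientreplay
  bool table_server_ready(PeerState s) { return s >= PeerState::CLIENTREPLAY; }

  // anchor queries are the exception: a rejoining MDS may need them, and the
  // anchor server may itself still be rejoining
  bool anchor_query_ok(PeerState s) { return s >= PeerState::REJOIN; }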




^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 07/39] mds: mark connection down when MDS fails
  2013-03-17 14:51 ` [PATCH 07/39] mds: mark connection down when MDS fails Yan, Zheng
@ 2013-03-20 18:37   ` Greg Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Greg Farnum @ 2013-03-20 18:37 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, sage

Reviewed-by: Greg Farnum <greg@inktank.com>



Software Engineer #42 @ http://inktank.com | http://ceph.com


On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:

> From: "Yan, Zheng" <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)>
> 
> So if the MDS restarts and uses the same address, it does not get
> old messages.
> 
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
> src/mds/MDS.cc | 8 ++++++--
> 1 file changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
> index 859782a..282fa64 100644
> --- a/src/mds/MDS.cc
> +++ b/src/mds/MDS.cc
> @@ -1046,8 +1046,10 @@ void MDS::handle_mds_map(MMDSMap *m)
> oldmap->get_failed_mds_set(oldfailed);
> mdsmap->get_failed_mds_set(failed);
> for (set<int>::iterator p = failed.begin(); p != failed.end(); ++p)
> - if (oldfailed.count(*p) == 0)
> + if (oldfailed.count(*p) == 0) {
> + messenger->mark_down(oldmap->get_inst(*p).addr);
> mdcache->handle_mds_failure(*p);
> + }
> 
> // or down then up?
> // did their addr/inst change?
> @@ -1055,8 +1057,10 @@ void MDS::handle_mds_map(MMDSMap *m)
> mdsmap->get_up_mds_set(up);
> for (set<int>::iterator p = up.begin(); p != up.end(); ++p) 
> if (oldmap->have_inst(*p) &&
> - oldmap->get_inst(*p) != mdsmap->get_inst(*p))
> + oldmap->get_inst(*p) != mdsmap->get_inst(*p)) {
> + messenger->mark_down(oldmap->get_inst(*p).addr);
> mdcache->handle_mds_failure(*p);
> + }
> }
> if (is_clientreplay() || is_active() || is_stopping()) {
> // did anyone stop?
> -- 
> 1.7.11.7
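
A rough sketch of the bookkeeping added when a new map arrives: mark down the old address of anyone who newly failed or came back at a different address, so messages from the old incarnation can no longer be delivered. Messenger and MdsMap here are simplified placeholders for the real messenger and MDSMap calls.

  #include <map>
  #include <set>
  #include <string>

  struct Messenger {
    void mark_down(const std::string& addr) { /* close any session to this address */ }
  };

  using MdsMap = std::map<int, std::string>;      // rank -> address, heavily simplified

  void handle_new_map(Messenger& msgr, const MdsMap& oldmap, const MdsMap& newmap,
                      const std::set<int>& newly_failed) {
    // newly failed: never accept anything from the old incarnation again
    for (int rank : newly_failed) {
      auto old = oldmap.find(rank);
      if (old != oldmap.end())
        msgr.mark_down(old->second);
    }
    // restarted with a different address: drop the stale connection too
    for (const auto& kv : newmap) {
      auto old = oldmap.find(kv.first);
      if (old != oldmap.end() && old->second != kv.second)
        msgr.mark_down(old->second);
    }
  }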




^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 08/39] mds: consider MDS as recovered when it reaches clientreply state.
  2013-03-17 14:51 ` [PATCH 08/39] mds: consider MDS as recovered when it reaches clientreply state Yan, Zheng
@ 2013-03-20 18:40   ` Greg Farnum
  2013-03-21  2:22     ` Yan, Zheng
  2013-03-20 19:09   ` Greg Farnum
  1 sibling, 1 reply; 117+ messages in thread
From: Greg Farnum @ 2013-03-20 18:40 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, sage

The idea of this patch makes sense, but I'm not sure we guarantee that each daemon sees every map update. If we don't, then an MDS that misses the map moving a peer into CLIENTREPLAY won't process that peer as having recovered on the next map. Sage or Joao, what guarantees does subscription provide?
-Greg

Software Engineer #42 @ http://inktank.com | http://ceph.com


On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:

> From: "Yan, Zheng" <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)>
>  
> MDS in clientreply state already start servering requests. It also
> make MDS::handle_mds_recovery() and MDS::recovery_done() match.
>  
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
> src/mds/MDS.cc | 2 ++
> 1 file changed, 2 insertions(+)
>  
> diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
> index 282fa64..b91dcbd 100644
> --- a/src/mds/MDS.cc
> +++ b/src/mds/MDS.cc
> @@ -1032,7 +1032,9 @@ void MDS::handle_mds_map(MMDSMap *m)
>  
> set<int> oldactive, active;
> oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE);
> + oldmap->get_mds_set(oldactive, MDSMap::STATE_CLIENTREPLAY);
> mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE);
> + mdsmap->get_mds_set(active, MDSMap::STATE_CLIENTREPLAY);
> for (set<int>::iterator p = active.begin(); p != active.end(); ++p)  
> if (*p != whoami && // not me
> oldactive.count(*p) == 0) // newly so?
> --  
> 1.7.11.7
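
A short sketch of the set arithmetic involved: a peer counts as recovered once it is in clientreplay or beyond, so the newly-recovered set is computed from the union of both states in the old and new maps. The helper names are invented; the real code fills one set with two get_mds_set() calls.

  #include <set>

  // ranks that were serving (active or clientreplay) in a given map
  std::set<int> serving(const std::set<int>& active, const std::set<int>& clientreplay) {
    std::set<int> s(active);
    s.insert(clientreplay.begin(), clientreplay.end());
    return s;
  }

  // peers to run handle_mds_recovery() for: serving now, but not serving before
  std::set<int> newly_recovered(const std::set<int>& old_serving,
                                const std::set<int>& now_serving, int whoami) {
    std::set<int> out;
    for (int rank : now_serving)
      if (rank != whoami && old_serving.count(rank) == 0)
        out.insert(rank);
    return out;
  }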



--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 08/39] mds: consider MDS as recovered when it reaches clientreply state.
  2013-03-17 14:51 ` [PATCH 08/39] mds: consider MDS as recovered when it reaches clientreply state Yan, Zheng
  2013-03-20 18:40   ` Greg Farnum
@ 2013-03-20 19:09   ` Greg Farnum
  1 sibling, 0 replies; 117+ messages in thread
From: Greg Farnum @ 2013-03-20 19:09 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, sage

Oh, also: s/clientreply/clientreplay in the commit message 

Software Engineer #42 @ http://inktank.com | http://ceph.com


On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:

> From: "Yan, Zheng" <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)>
> 
> MDS in clientreply state already start servering requests. It also
> make MDS::handle_mds_recovery() and MDS::recovery_done() match.
> 
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
> src/mds/MDS.cc | 2 ++
> 1 file changed, 2 insertions(+)
> 
> diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
> index 282fa64..b91dcbd 100644
> --- a/src/mds/MDS.cc
> +++ b/src/mds/MDS.cc
> @@ -1032,7 +1032,9 @@ void MDS::handle_mds_map(MMDSMap *m)
> 
> set<int> oldactive, active;
> oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE);
> + oldmap->get_mds_set(oldactive, MDSMap::STATE_CLIENTREPLAY);
> mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE);
> + mdsmap->get_mds_set(active, MDSMap::STATE_CLIENTREPLAY);
> for (set<int>::iterator p = active.begin(); p != active.end(); ++p) 
> if (*p != whoami && // not me
> oldactive.count(*p) == 0) // newly so?
> -- 
> 1.7.11.7




^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 09/39] mds: defer eval gather locks when removing replica
  2013-03-17 14:51 ` [PATCH 09/39] mds: defer eval gather locks when removing replica Yan, Zheng
@ 2013-03-20 19:36   ` Greg Farnum
  2013-03-21  2:29     ` Yan, Zheng
  0 siblings, 1 reply; 117+ messages in thread
From: Greg Farnum @ 2013-03-20 19:36 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, sage

On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
> 
> Locks' states should not change between composing the cache rejoin ack
> messages and sending the message. If Locker::eval_gather() is called
> in MDCache::{inode,dentry}_remove_replica(), it may wake requests and
> change locks' states.
> 
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
> src/mds/MDCache.cc | 51 ++++++++++++++++++++++++++++++---------------------
> src/mds/MDCache.h | 8 +++++---
> 2 files changed, 35 insertions(+), 24 deletions(-)
> 
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 19dc60b..0f6b842 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -3729,6 +3729,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
> // possible response(s)
> MMDSCacheRejoin *ack = 0; // if survivor
> set<vinodeno_t> acked_inodes; // if survivor
> + set<SimpleLock *> gather_locks; // if survivor
> bool survivor = false; // am i a survivor?
> 
> if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
> @@ -3851,7 +3852,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
> assert(dnl->is_primary());
> 
> if (survivor && dn->is_replica(from)) 
> - dentry_remove_replica(dn, from); // this induces a lock gather completion
> + dentry_remove_replica(dn, from, gather_locks); // this induces a lock gather completion

This comment is no longer accurate :) 
> int dnonce = dn->add_replica(from);
> dout(10) << " have " << *dn << dendl;
> if (ack) 
> @@ -3864,7 +3865,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
> assert(in);
> 
> if (survivor && in->is_replica(from)) 
> - inode_remove_replica(in, from);
> + inode_remove_replica(in, from, gather_locks);
> int inonce = in->add_replica(from);
> dout(10) << " have " << *in << dendl;
> 
> @@ -3887,7 +3888,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
> CInode *in = get_inode(*p);
> assert(in); // hmm fixme wrt stray?
> if (survivor && in->is_replica(from)) 
> - inode_remove_replica(in, from); // this induces a lock gather completion
> + inode_remove_replica(in, from, gather_locks); // this induces a lock gather completion

Same here. 

Other than those, looks good.
-Greg
Software Engineer #42 @ http://inktank.com | http://ceph.com


> int inonce = in->add_replica(from);
> dout(10) << " have base " << *in << dendl;
> 
> @@ -3909,8 +3910,11 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
> ack->add_inode_base(in);
> }
> 
> - rejoin_scour_survivor_replicas(from, ack, acked_inodes);
> + rejoin_scour_survivor_replicas(from, ack, gather_locks, acked_inodes);
> mds->send_message(ack, weak->get_connection());
> +
> + for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p)
> + mds->locker->eval_gather(*p);
> } else {
> // done?
> assert(rejoin_gather.count(from));
> @@ -4055,7 +4059,9 @@ bool MDCache::parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
> * all validated replicas are acked with a strong nonce, etc. if that isn't in the
> * ack, the replica dne, and we can remove it from our replica maps.
> */
> -void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set<vinodeno_t>& acked_inodes)
> +void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack,
> + set<SimpleLock *>& gather_locks,
> + set<vinodeno_t>& acked_inodes)
> {
> dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
> 
> @@ -4070,7 +4076,7 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set
> if (in->is_auth() &&
> in->is_replica(from) &&
> acked_inodes.count(p->second->vino()) == 0) {
> - inode_remove_replica(in, from);
> + inode_remove_replica(in, from, gather_locks);
> dout(10) << " rem " << *in << dendl;
> }
> 
> @@ -4099,7 +4105,7 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set
> if (dn->is_replica(from) &&
> (ack->strong_dentries.count(dir->dirfrag()) == 0 ||
> ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->name, dn->last)) == 0)) {
> - dentry_remove_replica(dn, from);
> + dentry_remove_replica(dn, from, gather_locks);
> dout(10) << " rem " << *dn << dendl;
> }
> }
> @@ -6189,6 +6195,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
> return;
> }
> 
> + set<SimpleLock *> gather_locks;
> // loop over realms
> for (map<dirfrag_t,MCacheExpire::realm>::iterator p = m->realms.begin();
> p != m->realms.end();
> @@ -6255,7 +6262,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
> // remove from our cached_by
> dout(7) << " inode expire on " << *in << " from mds." << from 
> << " cached_by was " << in->get_replicas() << dendl;
> - inode_remove_replica(in, from);
> + inode_remove_replica(in, from, gather_locks);
> } 
> else {
> // this is an old nonce, ignore expire.
> @@ -6332,7 +6339,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
> 
> if (nonce == dn->get_replica_nonce(from)) {
> dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
> - dentry_remove_replica(dn, from);
> + dentry_remove_replica(dn, from, gather_locks);
> } 
> else {
> dout(7) << " dentry_expire on " << *dn << " from mds." << from
> @@ -6343,6 +6350,8 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
> }
> }
> 
> + for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p)
> + mds->locker->eval_gather(*p);
> 
> // done
> m->put();
> @@ -6368,35 +6377,35 @@ void MDCache::discard_delayed_expire(CDir *dir)
> delayed_expire.erase(dir); 
> }
> 
> -void MDCache::inode_remove_replica(CInode *in, int from)
> +void MDCache::inode_remove_replica(CInode *in, int from, set<SimpleLock *>& gather_locks)
> {
> in->remove_replica(from);
> in->mds_caps_wanted.erase(from);
> 
> // note: this code calls _eval more often than it needs to!
> // fix lock
> - if (in->authlock.remove_replica(from)) mds->locker->eval_gather(&in->authlock);
> - if (in->linklock.remove_replica(from)) mds->locker->eval_gather(&in->linklock);
> - if (in->dirfragtreelock.remove_replica(from)) mds->locker->eval_gather(&in->dirfragtreelock);
> - if (in->filelock.remove_replica(from)) mds->locker->eval_gather(&in->filelock);
> - if (in->snaplock.remove_replica(from)) mds->locker->eval_gather(&in->snaplock);
> - if (in->xattrlock.remove_replica(from)) mds->locker->eval_gather(&in->xattrlock);
> + if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
> + if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
> + if (in->dirfragtreelock.remove_replica(from)) gather_locks.insert(&in->dirfragtreelock);
> + if (in->filelock.remove_replica(from)) gather_locks.insert(&in->filelock);
> + if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
> + if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
> 
> - if (in->nestlock.remove_replica(from)) mds->locker->eval_gather(&in->nestlock);
> - if (in->flocklock.remove_replica(from)) mds->locker->eval_gather(&in->flocklock);
> - if (in->policylock.remove_replica(from)) mds->locker->eval_gather(&in->policylock);
> + if (in->nestlock.remove_replica(from)) gather_locks.insert(&in->nestlock);
> + if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
> + if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
> 
> // trim?
> maybe_eval_stray(in);
> }
> 
> -void MDCache::dentry_remove_replica(CDentry *dn, int from)
> +void MDCache::dentry_remove_replica(CDentry *dn, int from, set<SimpleLock *>& gather_locks)
> {
> dn->remove_replica(from);
> 
> // fix lock
> if (dn->lock.remove_replica(from))
> - mds->locker->eval_gather(&dn->lock);
> + gather_locks.insert(&dn->lock);
> 
> CDentry::linkage_t *dnl = dn->get_projected_linkage();
> if (dnl->is_primary())
> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
> index f07ea74..a9f05c6 100644
> --- a/src/mds/MDCache.h
> +++ b/src/mds/MDCache.h
> @@ -406,7 +406,9 @@ protected:
> CDir* rejoin_invent_dirfrag(dirfrag_t df);
> bool rejoin_fetch_dirfrags(MMDSCacheRejoin *m);
> void handle_cache_rejoin_strong(MMDSCacheRejoin *m);
> - void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set<vinodeno_t>& acked_inodes);
> + void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack,
> + set<SimpleLock *>& gather_locks,
> + set<vinodeno_t>& acked_inodes);
> void handle_cache_rejoin_ack(MMDSCacheRejoin *m);
> void handle_cache_rejoin_purge(MMDSCacheRejoin *m);
> void handle_cache_rejoin_missing(MMDSCacheRejoin *m);
> @@ -607,8 +609,8 @@ public:
> }
> protected:
> 
> - void inode_remove_replica(CInode *in, int rep);
> - void dentry_remove_replica(CDentry *dn, int rep);
> + void inode_remove_replica(CInode *in, int rep, set<SimpleLock *>& gather_locks);
> + void dentry_remove_replica(CDentry *dn, int rep, set<SimpleLock *>& gather_locks);
> 
> void rename_file(CDentry *srcdn, CDentry *destdn);
> 
> -- 
> 1.7.11.7
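
A compact sketch of the deferral pattern: while the rejoin ack is being composed the code only records which locks lost a replica, then evaluates them in one pass after the ack is sent, so lock states cannot change underneath the message being built. Lock and the helper names are simplified stand-ins for SimpleLock and eval_gather().

  #include <set>
  #include <vector>

  struct Lock {
    int replicas = 1;
    bool remove_replica(int /*from*/) { return --replicas == 0; }  // true => a gather may now complete
  };

  // while composing the rejoin ack: only record which locks changed
  void remove_replica_deferred(std::vector<Lock>& locks, int from, std::set<Lock*>& gather_locks) {
    for (Lock& l : locks)
      if (l.remove_replica(from))
        gather_locks.insert(&l);
  }

  // after the ack has been sent: evaluate everything that was recorded
  void eval_deferred(const std::set<Lock*>& gather_locks) {
    for (Lock* l : gather_locks)
      (void)l;                      // locker->eval_gather(l) would run here
  }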



^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 11/39] mds: don't delay processing replica buffer in slave request
  2013-03-17 14:51 ` [PATCH 11/39] mds: don't delay processing replica buffer in slave request Yan, Zheng
@ 2013-03-20 21:19   ` Greg Farnum
  2013-03-21  2:38     ` Yan, Zheng
  0 siblings, 1 reply; 117+ messages in thread
From: Greg Farnum @ 2013-03-20 21:19 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, sage

On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
> 
> Replicated objects need to be added into the cache immediately
> 
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Why do we need to add them right away? Shouldn't we have a journaled replica if we need it?
-Greg

Software Engineer #42 @ http://inktank.com | http://ceph.com
> ---
> src/mds/MDCache.cc | 12 ++++++++++++
> src/mds/MDCache.h | 2 +-
> src/mds/MDS.cc | 6 +++---
> src/mds/Server.cc | 55 +++++++++++++++++++++++++++++++++++++++---------------
> 4 files changed, 56 insertions(+), 19 deletions(-)
> 
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 0f6b842..b668842 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -7722,6 +7722,18 @@ void MDCache::_find_ino_dir(inodeno_t ino, Context *fin, bufferlist& bl, int r)
> 
> /* ---------------------------- */
> 
> +int MDCache::get_num_client_requests()
> +{
> + int count = 0;
> + for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
> + p != active_requests.end();
> + ++p) {
> + if (p->second->reqid.name.is_client() && !p->second->is_slave())
> + count++;
> + }
> + return count;
> +}
> +
> /* This function takes over the reference to the passed Message */
> MDRequest *MDCache::request_start(MClientRequest *req)
> {
> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
> index a9f05c6..4634121 100644
> --- a/src/mds/MDCache.h
> +++ b/src/mds/MDCache.h
> @@ -240,7 +240,7 @@ protected:
> hash_map<metareqid_t, MDRequest*> active_requests; 
> 
> public:
> - int get_num_active_requests() { return active_requests.size(); }
> + int get_num_client_requests();
> 
> MDRequest* request_start(MClientRequest *req);
> MDRequest* request_start_slave(metareqid_t rid, __u32 attempt, int by);
> diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
> index b91dcbd..e99eecc 100644
> --- a/src/mds/MDS.cc
> +++ b/src/mds/MDS.cc
> @@ -1900,9 +1900,9 @@ bool MDS::_dispatch(Message *m)
> mdcache->is_open() &&
> replay_queue.empty() &&
> want_state == MDSMap::STATE_CLIENTREPLAY) {
> - dout(10) << " still have " << mdcache->get_num_active_requests()
> - << " active replay requests" << dendl;
> - if (mdcache->get_num_active_requests() == 0)
> + int num_requests = mdcache->get_num_client_requests();
> + dout(10) << " still have " << num_requests << " active replay requests" << dendl;
> + if (num_requests == 0)
> clientreplay_done();
> }
> 
> diff --git a/src/mds/Server.cc b/src/mds/Server.cc
> index 4c4c86b..8e89e4c 100644
> --- a/src/mds/Server.cc
> +++ b/src/mds/Server.cc
> @@ -107,10 +107,8 @@ void Server::dispatch(Message *m)
> (m->get_type() == CEPH_MSG_CLIENT_REQUEST &&
> (static_cast<MClientRequest*>(m))->is_replay()))) {
> // replaying!
> - } else if (mds->is_clientreplay() && m->get_type() == MSG_MDS_SLAVE_REQUEST &&
> - ((static_cast<MMDSSlaveRequest*>(m))->is_reply() ||
> - !mds->mdsmap->is_active(m->get_source().num()))) {
> - // slave reply or the master is also in the clientreplay stage
> + } else if (m->get_type() == MSG_MDS_SLAVE_REQUEST) {
> + // handle_slave_request() will wait if necessary
> } else {
> dout(3) << "not active yet, waiting" << dendl;
> mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
> @@ -1291,6 +1289,13 @@ void Server::handle_slave_request(MMDSSlaveRequest *m)
> if (m->is_reply())
> return handle_slave_request_reply(m);
> 
> + CDentry *straydn = NULL;
> + if (m->stray.length() > 0) {
> + straydn = mdcache->add_replica_stray(m->stray, from);
> + assert(straydn);
> + m->stray.clear();
> + }
> +
> // am i a new slave?
> MDRequest *mdr = NULL;
> if (mdcache->have_request(m->get_reqid())) {
> @@ -1326,9 +1331,26 @@ void Server::handle_slave_request(MMDSSlaveRequest *m)
> m->put();
> return;
> }
> - mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m->get_source().num());
> + mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), from);
> }
> assert(mdr->slave_request == 0); // only one at a time, please! 
> +
> + if (straydn) {
> + mdr->pin(straydn);
> + mdr->straydn = straydn;
> + }
> +
> + if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
> + dout(3) << "not clientreplay|active yet, waiting" << dendl;
> + mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
> + return;
> + } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
> + mdr->locks.empty()) {
> + dout(3) << "not active yet, waiting" << dendl;
> + mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
> + return;
> + }
> +
> mdr->slave_request = m;
> 
> dispatch_slave_request(mdr);
> @@ -1339,6 +1361,12 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
> {
> int from = m->get_source().num();
> 
> + if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
> + dout(3) << "not clientreplay|active yet, waiting" << dendl;
> + mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
> + return;
> + }
> +
> if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
> metareqid_t r = m->get_reqid();
> mds->mdcache->committed_master_slave(r, from);
> @@ -5138,10 +5166,8 @@ void Server::handle_slave_rmdir_prep(MDRequest *mdr)
> dout(10) << " dn " << *dn << dendl;
> mdr->pin(dn);
> 
> - assert(mdr->slave_request->stray.length() > 0);
> - CDentry *straydn = mdcache->add_replica_stray(mdr->slave_request->stray, mdr->slave_to_mds);
> - assert(straydn);
> - mdr->pin(straydn);
> + assert(mdr->straydn);
> + CDentry *straydn = mdr->straydn;
> dout(10) << " straydn " << *straydn << dendl;
> 
> mdr->now = mdr->slave_request->now;
> @@ -5208,6 +5234,7 @@ void Server::_logged_slave_rmdir(MDRequest *mdr, CDentry *dn, CDentry *straydn)
> // done.
> mdr->slave_request->put();
> mdr->slave_request = 0;
> + mdr->straydn = 0;
> }
> 
> void Server::handle_slave_rmdir_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
> @@ -6460,15 +6487,12 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
> // stray?
> bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
> (srcdnl->is_primary() || destdnl->is_primary()));
> - CDentry *straydn = 0;
> - if (destdnl->is_primary() && !linkmerge) {
> - assert(mdr->slave_request->stray.length() > 0);
> - straydn = mdcache->add_replica_stray(mdr->slave_request->stray, mdr->slave_to_mds);
> + CDentry *straydn = mdr->straydn;
> + if (destdnl->is_primary() && !linkmerge)
> assert(straydn);
> - mdr->pin(straydn);
> - }
> 
> mdr->now = mdr->slave_request->now;
> + mdr->more()->srcdn_auth_mds = srcdn->authority().first;
> 
> // set up commit waiter (early, to clean up any freezing etc we do)
> if (!mdr->more()->slave_commit)
> @@ -6651,6 +6675,7 @@ void Server::_logged_slave_rename(MDRequest *mdr,
> // done.
> mdr->slave_request->put();
> mdr->slave_request = 0;
> + mdr->straydn = 0;
> }
> 
> void Server::_commit_slave_rename(MDRequest *mdr, int r,
> -- 
> 1.7.11.7
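
A small sketch of the counting change: slave requests no longer hold up the clientreplay-to-active transition, only client-originated requests do. Request and count_client_requests are simplified stand-ins for the MDRequest bookkeeping.

  #include <unordered_map>

  struct Request { bool from_client; bool is_slave; };

  // stand-in for MDCache::get_num_client_requests(): slave requests are ignored, so only
  // replayed client requests decide when the clientreplay stage is finished
  int count_client_requests(const std::unordered_map<int, Request>& active_requests) {
    int count = 0;
    for (const auto& kv : active_requests) {
      const Request& req = kv.second;
      if (req.from_client && !req.is_slave)
        ++count;
    }
    return count;
  }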




^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 12/39] mds: compose and send resolve messages in batch
  2013-03-17 14:51 ` [PATCH 12/39] mds: compose and send resolve messages in batch Yan, Zheng
@ 2013-03-20 21:45   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-20 21:45 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Reviewed-by: Greg Farnum <greg@inktank.com>

Software Engineer #42 @ http://inktank.com | http://ceph.com

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> Resolve messages for all MDS are the same, so we can compose and
> send them in batch.
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc | 181 +++++++++++++++++++++++++----------------------------
>  src/mds/MDCache.h  |  11 ++--
>  2 files changed, 93 insertions(+), 99 deletions(-)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index b668842..c455a20 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -2432,10 +2432,6 @@ void MDCache::resolve_start()
>      if (rootdir)
>        adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
>    }
> -
> -  for (map<int, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
> -       p != uncommitted_slave_updates.end(); ++p)
> -    need_resolve_ack.insert(p->first);
>  }
>
>  void MDCache::send_resolves()
> @@ -2444,9 +2440,10 @@ void MDCache::send_resolves()
>    got_resolve.clear();
>    other_ambiguous_imports.clear();
>
> -  if (!need_resolve_ack.empty()) {
> -    for (set<int>::iterator p = need_resolve_ack.begin(); p != need_resolve_ack.end(); ++p)
> -      send_slave_resolve(*p);
> +  send_slave_resolves();
> +  if (!resolve_ack_gather.empty()) {
> +    dout(10) << "send_resolves still waiting for resolve ack from ("
> +             << need_resolve_ack << ")" << dendl;
>      return;
>    }
>    if (!need_resolve_rollback.empty()) {
> @@ -2454,95 +2451,74 @@ void MDCache::send_resolves()
>              << need_resolve_rollback << ")" << dendl;
>      return;
>    }
> -  assert(uncommitted_slave_updates.empty());
> -  for (set<int>::iterator p = recovery_set.begin(); p != recovery_set.end(); ++p) {
> -    int who = *p;
> -    if (who == mds->whoami)
> -      continue;
> -    if (migrator->is_importing() ||
> -       migrator->is_exporting())
> -      send_resolve_later(who);
> -    else
> -      send_resolve_now(who);
> -  }
> -}
> -
> -void MDCache::send_resolve_later(int who)
> -{
> -  dout(10) << "send_resolve_later to mds." << who << dendl;
> -  wants_resolve.insert(who);
> +  send_subtree_resolves();
>  }
>
> -void MDCache::maybe_send_pending_resolves()
> +void MDCache::send_slave_resolves()
>  {
> -  if (wants_resolve.empty())
> -    return;  // nothing to send.
> -
> -  // only if it's appropriate!
> -  if (migrator->is_exporting() ||
> -      migrator->is_importing()) {
> -    dout(7) << "maybe_send_pending_resolves waiting, imports/exports still in progress" << dendl;
> -    migrator->show_importing();
> -    migrator->show_exporting();
> -    return;  // not now
> -  }
> -
> -  // ok, send them.
> -  for (set<int>::iterator p = wants_resolve.begin();
> -       p != wants_resolve.end();
> -       ++p)
> -    send_resolve_now(*p);
> -  wants_resolve.clear();
> -}
> +  dout(10) << "send_slave_resolves" << dendl;
>
> +  map<int, MMDSResolve*> resolves;
>
> -class C_MDC_SendResolve : public Context {
> -  MDCache *mdc;
> -  int who;
> -public:
> -  C_MDC_SendResolve(MDCache *c, int w) : mdc(c), who(w) { }
> -  void finish(int r) {
> -    mdc->send_resolve_now(who);
> -  }
> -};
> -
> -void MDCache::send_slave_resolve(int who)
> -{
> -  dout(10) << "send_slave_resolve to mds." << who << dendl;
> -  MMDSResolve *m = new MMDSResolve;
> -
> -  // list prepare requests lacking a commit
> -  // [active survivor]
> -  for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
> -      p != active_requests.end();
> -      ++p) {
> -    if (p->second->is_slave() && p->second->slave_to_mds == who) {
> -      dout(10) << " including uncommitted " << *p->second << dendl;
> -      m->add_slave_request(p->first);
> +  if (mds->is_resolve()) {
> +    for (map<int, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
> +        p != uncommitted_slave_updates.end();
> +        ++p) {
> +      resolves[p->first] = new MMDSResolve;
> +      for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
> +          q != p->second.end();
> +          ++q) {
> +       dout(10) << " including uncommitted " << q->first << dendl;
> +       resolves[p->first]->add_slave_request(q->first);
> +      }
>      }
> -  }
> -  // [resolving]
> -  if (uncommitted_slave_updates.count(who) &&
> -      !uncommitted_slave_updates[who].empty()) {
> -    for (map<metareqid_t, MDSlaveUpdate*>::iterator p = uncommitted_slave_updates[who].begin();
> -       p != uncommitted_slave_updates[who].end();
> -       ++p) {
> -      dout(10) << " including uncommitted " << p->first << dendl;
> -      m->add_slave_request(p->first);
> +  } else {
> +    set<int> resolve_set;
> +    mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
> +    for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
> +        p != active_requests.end();
> +        ++p) {
> +      if (!p->second->is_slave() || !p->second->slave_did_prepare())
> +       continue;
> +      int master = p->second->slave_to_mds;
> +      if (resolve_set.count(master)) {
> +       dout(10) << " including uncommitted " << *p->second << dendl;
> +       if (!resolves.count(master))
> +         resolves[master] = new MMDSResolve;
> +       resolves[master]->add_slave_request(p->first);
> +      }
>      }
>    }
>
> -  assert(!m->slave_requests.empty());
> -  dout(10) << " will need resolve ack from mds." << who << dendl;
> -  mds->send_message_mds(m, who);
> +  for (map<int, MMDSResolve*>::iterator p = resolves.begin();
> +       p != resolves.end();
> +       ++p) {
> +    dout(10) << "sending slave resolve to mds." << p->first << dendl;
> +    mds->send_message_mds(p->second, p->first);
> +    need_resolve_ack.insert(p->first);
> +  }
>  }
>
> -void MDCache::send_resolve_now(int who)
> +void MDCache::send_subtree_resolves()
>  {
> -  dout(10) << "send_resolve_now to mds." << who << dendl;
> -  MMDSResolve *m = new MMDSResolve;
> +  dout(10) << "send_subtree_resolves" << dendl;
>
> -  show_subtrees();
> +  if (migrator->is_exporting() || migrator->is_importing()) {
> +    dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
> +    migrator->show_importing();
> +    migrator->show_exporting();
> +    resolves_pending = true;
> +    return;  // not now
> +  }
> +
> +  map<int, MMDSResolve*> resolves;
> +  for (set<int>::iterator p = recovery_set.begin();
> +       p != recovery_set.end();
> +       ++p) {
> +    if (*p == mds->whoami)
> +      continue;
> +    resolves[*p] = new MMDSResolve;
> +  }
>
>    // known
>    for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
> @@ -2562,22 +2538,30 @@ void MDCache::send_resolve_now(int who)
>        set<CDir*> bounds;
>        get_subtree_bounds(dir, bounds);
>        vector<dirfrag_t> dfls;
> -      for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
> -       dfls.push_back((*p)->dirfrag());
> -      m->add_ambiguous_import(dir->dirfrag(), dfls);
> +      for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
> +       dfls.push_back((*q)->dirfrag());
> +      for (map<int, MMDSResolve*>::iterator q = resolves.begin();
> +          q != resolves.end();
> +          ++q)
> +       resolves[q->first]->add_ambiguous_import(dir->dirfrag(), dfls);
>        dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
>      } else {
>        // not ambiguous.
> -      m->add_subtree(dir->dirfrag());
> -
> +      for (map<int, MMDSResolve*>::iterator q = resolves.begin();
> +          q != resolves.end();
> +          ++q)
> +       resolves[q->first]->add_subtree(dir->dirfrag());
>        // bounds too
>        vector<dirfrag_t> dfls;
>        for (set<CDir*>::iterator q = subtrees[dir].begin();
>            q != subtrees[dir].end();
>            ++q) {
>         CDir *bound = *q;
> -       m->add_subtree_bound(dir->dirfrag(), bound->dirfrag());
>         dfls.push_back(bound->dirfrag());
> +       for (map<int, MMDSResolve*>::iterator r = resolves.begin();
> +            r != resolves.end();
> +            ++r)
> +         resolves[r->first]->add_subtree_bound(dir->dirfrag(), bound->dirfrag());
>        }
>        dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
>      }
> @@ -2587,15 +2571,23 @@ void MDCache::send_resolve_now(int who)
>    for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
>         p != my_ambiguous_imports.end();
>         ++p) {
> -    m->add_ambiguous_import(p->first, p->second);
> +    for (map<int, MMDSResolve*>::iterator q = resolves.begin();
> +        q != resolves.end();
> +        ++q)
> +      resolves[q->first]->add_ambiguous_import(p->first, p->second);
>      dout(10) << " ambig " << p->first << " " << p->second << dendl;
>    }
>
>    // send
> -  mds->send_message_mds(m, who);
> +  for (map<int, MMDSResolve*>::iterator p = resolves.begin();
> +       p != resolves.end();
> +       ++p) {
> +    dout(10) << "sending subtee resolve to mds." << p->first << dendl;
> +    mds->send_message_mds(p->second, p->first);
> +  }
> +  resolves_pending = false;
>  }
>
> -
>  void MDCache::handle_mds_failure(int who)
>  {
>    dout(7) << "handle_mds_failure mds." << who << dendl;
> @@ -2631,7 +2623,6 @@ void MDCache::handle_mds_failure(int who)
>      // slave to the failed node?
>      if (p->second->slave_to_mds == who) {
>        if (p->second->slave_did_prepare()) {
> -       need_resolve_ack.insert(who);
>         dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl;
>        } else {
>         dout(10) << " slave request " << *p->second << " has no prepare, finishing up" << dendl;
> @@ -3011,7 +3002,7 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
>
>    need_resolve_ack.erase(from);
>    if (need_resolve_ack.empty() && need_resolve_rollback.empty()) {
> -    send_resolves();
> +    send_subtree_resolves();
>      process_delayed_resolve();
>    }
>
> @@ -3078,7 +3069,7 @@ void MDCache::finish_rollback(metareqid_t reqid) {
>      finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
>    need_resolve_rollback.erase(reqid);
>    if (need_resolve_ack.empty() && need_resolve_rollback.empty()) {
> -    send_resolves();
> +    send_subtree_resolves();
>      process_delayed_resolve();
>    }
>  }
> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
> index 4634121..10e3dd7 100644
> --- a/src/mds/MDCache.h
> +++ b/src/mds/MDCache.h
> @@ -328,6 +328,7 @@ protected:
>    friend class ESlaveUpdate;
>    friend class ECommitted;
>
> +  bool resolves_pending;
>    set<int> wants_resolve;   // nodes i need to send my resolve to
>    set<int> got_resolve;     // nodes i got resolves from
>    set<int> need_resolve_ack;   // nodes i need a resolve_ack from
> @@ -367,10 +368,12 @@ public:
>    void finish_ambiguous_import(dirfrag_t dirino);
>    void resolve_start();
>    void send_resolves();
> -  void send_slave_resolve(int who);
> -  void send_resolve_now(int who);
> -  void send_resolve_later(int who);
> -  void maybe_send_pending_resolves();
> +  void send_slave_resolves();
> +  void send_subtree_resolves();
> +  void maybe_send_pending_resolves() {
> +    if (resolves_pending)
> +      send_subtree_resolves();
> +  }
>
>    void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
>                                map<dirfrag_t,vector<dirfrag_t> >& subtrees);
> --
> 1.7.11.7
>
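
A brief sketch of the batching structure: one message object per recovering peer is created up front, every subtree record is appended to all of them in a single walk, and the whole map is sent at the end. Resolve and compose_resolves are simplified stand-ins for MMDSResolve and the subtree-map walk.

  #include <map>
  #include <string>
  #include <vector>

  struct Resolve { std::vector<std::string> records; };   // stand-in for MMDSResolve

  // one message per recovering peer, filled in a single walk over the subtree records,
  // then handed back to the caller to send
  std::map<int, Resolve> compose_resolves(const std::vector<int>& peers,
                                          const std::vector<std::string>& subtree_records) {
    std::map<int, Resolve> out;
    for (int p : peers)
      out.emplace(p, Resolve{});                           // create the per-peer message up front
    for (const auto& rec : subtree_records)
      for (auto& kv : out)
        kv.second.records.push_back(rec);                  // identical content for every peer
    return out;
  }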

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 13/39] mds: don't send resolve message between active MDS
  2013-03-17 14:51 ` [PATCH 13/39] mds: don't send resolve message between active MDS Yan, Zheng
@ 2013-03-20 21:56   ` Gregory Farnum
  2013-03-21  2:55     ` Yan, Zheng
  0 siblings, 1 reply; 117+ messages in thread
From: Gregory Farnum @ 2013-03-20 21:56 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> When MDS cluster is resolving, current behavior is sending subtree resolve
> message to all other MDS and waiting for all other MDS' resolve message.
> The problem is that active MDS can have diffent subtree map due to rename.
> Besides gathering active MDS's resolve messages are also racy. The only
> function for these messages is disambiguate other MDS' import. We can
> replace it by import finish notification.
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc  | 12 +++++++++---
>  src/mds/Migrator.cc | 25 +++++++++++++++++++++++--
>  src/mds/Migrator.h  |  3 ++-
>  3 files changed, 34 insertions(+), 6 deletions(-)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index c455a20..73c1d59 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -2517,7 +2517,8 @@ void MDCache::send_subtree_resolves()
>         ++p) {
>      if (*p == mds->whoami)
>        continue;
> -    resolves[*p] = new MMDSResolve;
> +    if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
> +      resolves[*p] = new MMDSResolve;
>    }
>
>    // known
> @@ -2837,7 +2838,7 @@ void MDCache::handle_resolve(MMDSResolve *m)
>           migrator->import_reverse(dir);
>         } else {
>           dout(7) << "ambiguous import succeeded on " << *dir << dendl;
> -         migrator->import_finish(dir);
> +         migrator->import_finish(dir, true);
>         }
>         my_ambiguous_imports.erase(p);  // no longer ambiguous.
>        }
> @@ -3432,7 +3433,12 @@ void MDCache::rejoin_send_rejoins()
>         ++p) {
>      CDir *dir = p->first;
>      assert(dir->is_subtree_root());
> -    assert(!dir->is_ambiguous_dir_auth());
> +    if (dir->is_ambiguous_dir_auth()) {
> +      // exporter is recovering, importer is survivor.

The importer has to be the MDS this code is running on, right?

> +      assert(rejoins.count(dir->authority().first));
> +      assert(!rejoins.count(dir->authority().second));
> +      continue;
> +    }
>
>      // my subtree?
>      if (dir->is_auth())
> diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
> index 5e53803..833df12 100644
> --- a/src/mds/Migrator.cc
> +++ b/src/mds/Migrator.cc
> @@ -2088,6 +2088,23 @@ void Migrator::import_reverse(CDir *dir)
>    }
>  }
>
> +void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
> +{
> +  dout(7) << "import_notify_finish " << *dir << dendl;
> +
> +  for (set<int>::iterator p = import_bystanders[dir].begin();
> +       p != import_bystanders[dir].end();
> +       ++p) {
> +    MExportDirNotify *notify =
> +      new MExportDirNotify(dir->dirfrag(), false,
> +                          pair<int,int>(import_peer[dir->dirfrag()], mds->get_nodeid()),
> +                          pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));

I don't think this is quite right — we're notifying them that we've
just finished importing data from somebody, right? And so we know that
we're the auth node...

> +    for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); i++)
> +      notify->get_bounds().push_back((*i)->dirfrag());
> +    mds->send_message_mds(notify, *p);
> +  }
> +}
> +
>  void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
>  {
>    dout(7) << "import_notify_abort " << *dir << dendl;
> @@ -2183,11 +2200,11 @@ void Migrator::handle_export_finish(MExportDirFinish *m)
>    CDir *dir = cache->get_dirfrag(m->get_dirfrag());
>    assert(dir);
>    dout(7) << "handle_export_finish on " << *dir << dendl;
> -  import_finish(dir);
> +  import_finish(dir, false);
>    m->put();
>  }
>
> -void Migrator::import_finish(CDir *dir)
> +void Migrator::import_finish(CDir *dir, bool notify)
>  {
>    dout(7) << "import_finish on " << *dir << dendl;
>
> @@ -2205,6 +2222,10 @@ void Migrator::import_finish(CDir *dir)
>    // remove pins
>    set<CDir*> bounds;
>    cache->get_subtree_bounds(dir, bounds);
> +
> +  if (notify)
> +    import_notify_finish(dir, bounds);
> +
>    import_remove_pins(dir, bounds);
>
>    map<CInode*, map<client_t,Capability::Export> > cap_imports;
> diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
> index 7988f32..2889a74 100644
> --- a/src/mds/Migrator.h
> +++ b/src/mds/Migrator.h
> @@ -273,12 +273,13 @@ protected:
>    void import_reverse_unfreeze(CDir *dir);
>    void import_reverse_final(CDir *dir);
>    void import_notify_abort(CDir *dir, set<CDir*>& bounds);
> +  void import_notify_finish(CDir *dir, set<CDir*>& bounds);
>    void import_logged_start(dirfrag_t df, CDir *dir, int from,
>                            map<client_t,entity_inst_t> &imported_client_map,
>                            map<client_t,uint64_t>& sseqmap);
>    void handle_export_finish(MExportDirFinish *m);
>  public:
> -  void import_finish(CDir *dir);
> +  void import_finish(CDir *dir, bool notify);
>  protected:
>
>    void handle_export_caps(MExportCaps *m);
> --
> 1.7.11.7
>
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 14/39] mds: set resolve/rejoin gather MDS set in advance
  2013-03-17 14:51 ` [PATCH 14/39] mds: set resolve/rejoin gather MDS set in advance Yan, Zheng
@ 2013-03-20 22:09   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-20 22:09 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> For active MDS, it may receive resolve/resolve message before receiving

resolve/rejoin, maybe?
Other than that,
Reviewed-by: Greg Farnum <greg@inktank.com>

> the mdsmap message that claims the MDS cluster is in resolving/rejoning
> state. So instead of set the gather MDS set when receiving the mdsmap.
> set them in advance when detecting MDS' failure.
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc | 41 +++++++++++++++++++----------------------
>  src/mds/MDCache.h  |  5 ++---
>  2 files changed, 21 insertions(+), 25 deletions(-)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 73c1d59..69db1dd 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -2432,18 +2432,17 @@ void MDCache::resolve_start()
>      if (rootdir)
>        adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
>    }
> +  resolve_gather = recovery_set;
> +  resolve_gather.erase(mds->get_nodeid());
> +  rejoin_gather = resolve_gather;
>  }
>
>  void MDCache::send_resolves()
>  {
> -  // reset resolve state
> -  got_resolve.clear();
> -  other_ambiguous_imports.clear();
> -
>    send_slave_resolves();
>    if (!resolve_ack_gather.empty()) {
>      dout(10) << "send_resolves still waiting for resolve ack from ("
> -             << need_resolve_ack << ")" << dendl;
> +            << resolve_ack_gather << ")" << dendl;
>      return;
>    }
>    if (!need_resolve_rollback.empty()) {
> @@ -2495,7 +2494,7 @@ void MDCache::send_slave_resolves()
>         ++p) {
>      dout(10) << "sending slave resolve to mds." << p->first << dendl;
>      mds->send_message_mds(p->second, p->first);
> -    need_resolve_ack.insert(p->first);
> +    resolve_ack_gather.insert(p->first);
>    }
>  }
>
> @@ -2598,16 +2597,15 @@ void MDCache::handle_mds_failure(int who)
>    recovery_set.erase(mds->get_nodeid());
>    dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
>
> -  // adjust my recovery lists
> -  wants_resolve.erase(who);   // MDS will ask again
> -  got_resolve.erase(who);     // i'll get another.
> +  resolve_gather.insert(who);
>    discard_delayed_resolve(who);
>
> +  rejoin_gather.insert(who);
>    rejoin_sent.erase(who);        // i need to send another
>    rejoin_ack_gather.erase(who);  // i'll need/get another.
>
> -  dout(10) << " wants_resolve " << wants_resolve << dendl;
> -  dout(10) << " got_resolve " << got_resolve << dendl;
> +  dout(10) << " resolve_gather " << resolve_gather << dendl;
> +  dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
>    dout(10) << " rejoin_sent " << rejoin_sent << dendl;
>    dout(10) << " rejoin_gather " << rejoin_gather << dendl;
>    dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
> @@ -2788,7 +2786,7 @@ void MDCache::handle_resolve(MMDSResolve *m)
>      return;
>    }
>
> -  if (!need_resolve_ack.empty() || !need_resolve_rollback.empty()) {
> +  if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) {
>      dout(10) << "delay processing subtree resolve" << dendl;
>      discard_delayed_resolve(from);
>      delayed_resolve[from] = m;
> @@ -2875,7 +2873,7 @@ void MDCache::handle_resolve(MMDSResolve *m)
>    }
>
>    // did i get them all?
> -  got_resolve.insert(from);
> +  resolve_gather.erase(from);
>
>    maybe_resolve_finish();
>
> @@ -2901,12 +2899,12 @@ void MDCache::discard_delayed_resolve(int who)
>
>  void MDCache::maybe_resolve_finish()
>  {
> -  assert(need_resolve_ack.empty());
> +  assert(resolve_ack_gather.empty());
>    assert(need_resolve_rollback.empty());
>
> -  if (got_resolve != recovery_set) {
> -    dout(10) << "maybe_resolve_finish still waiting for more resolves, got ("
> -            << got_resolve << "), need (" << recovery_set << ")" << dendl;
> +  if (!resolve_gather.empty()) {
> +    dout(10) << "maybe_resolve_finish still waiting for resolves ("
> +            << resolve_gather << ")" << dendl;
>      return;
>    } else {
>      dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
> @@ -2926,7 +2924,7 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
>    dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
>    int from = ack->get_source().num();
>
> -  if (!need_resolve_ack.count(from)) {
> +  if (!resolve_ack_gather.count(from)) {
>      ack->put();
>      return;
>    }
> @@ -3001,8 +2999,8 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
>        assert(p->second->slave_to_mds != from);
>    }
>
> -  need_resolve_ack.erase(from);
> -  if (need_resolve_ack.empty() && need_resolve_rollback.empty()) {
> +  resolve_ack_gather.erase(from);
> +  if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
>      send_subtree_resolves();
>      process_delayed_resolve();
>    }
> @@ -3069,7 +3067,7 @@ void MDCache::finish_rollback(metareqid_t reqid) {
>    if (mds->is_resolve())
>      finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
>    need_resolve_rollback.erase(reqid);
> -  if (need_resolve_ack.empty() && need_resolve_rollback.empty()) {
> +  if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
>      send_subtree_resolves();
>      process_delayed_resolve();
>    }
> @@ -3417,7 +3415,6 @@ void MDCache::rejoin_send_rejoins()
>      if (*p == mds->get_nodeid())  continue;  // nothing to myself!
>      if (rejoin_sent.count(*p)) continue;     // already sent a rejoin to this node!
>      if (mds->is_rejoin()) {
> -      rejoin_gather.insert(*p);
>        rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
>        rejoins[*p]->copy_cap_exports(cap_export_bl);
>      } else if (mds->mdsmap->is_rejoin(*p))
> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
> index 10e3dd7..278debf 100644
> --- a/src/mds/MDCache.h
> +++ b/src/mds/MDCache.h
> @@ -329,9 +329,8 @@ protected:
>    friend class ECommitted;
>
>    bool resolves_pending;
> -  set<int> wants_resolve;   // nodes i need to send my resolve to
> -  set<int> got_resolve;     // nodes i got resolves from
> -  set<int> need_resolve_ack;   // nodes i need a resolve_ack from
> +  set<int> resolve_gather;     // nodes i need resolves from
> +  set<int> resolve_ack_gather; // nodes i need a resolve_ack from
>    map<metareqid_t, int> need_resolve_rollback;  // rollbacks i'm writing to the journal
>    map<int, MMDSResolve*> delayed_resolve;
>
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread
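
The point of the patch, restated: the failed rank is added to both resolve_gather and rejoin_gather the moment the failure is detected, so a resolve that races ahead of the mdsmap update still finds the sets populated. Below is a minimal, self-contained sketch of that bookkeeping; the types and names are simplified stand-ins, not the real MDCache interface.

// resolve_gather_sketch.cc -- illustrative only; simplified stand-in names.
#include <cassert>
#include <iostream>
#include <set>

struct RecoveryTracker {
  std::set<int> recovery_set;    // all ranks participating in recovery
  std::set<int> resolve_gather;  // ranks we still need a resolve from
  std::set<int> rejoin_gather;   // ranks we still need a cache rejoin from

  // Called as soon as a failure is detected, before any mdsmap state change.
  void handle_mds_failure(int who) {
    recovery_set.insert(who);
    resolve_gather.insert(who);
    rejoin_gather.insert(who);
  }

  // Called when a resolve message from 'from' is processed.
  void handle_resolve(int from) {
    resolve_gather.erase(from);
    maybe_resolve_finish();
  }

  void maybe_resolve_finish() {
    if (!resolve_gather.empty()) {
      std::cout << "still waiting for " << resolve_gather.size()
                << " resolve(s)\n";
      return;
    }
    std::cout << "got all resolves\n";
  }
};

int main() {
  RecoveryTracker t;
  t.handle_mds_failure(1);  // mds.1 fails
  t.handle_resolve(1);      // its resolve can be accepted right away, even if
                            // the "cluster is resolving" mdsmap arrives later
  assert(t.resolve_gather.empty());
  return 0;
}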

* Re: [PATCH 15/39] mds: don't send MDentry{Link,Unlink} before receiving cache rejoin
  2013-03-17 14:51 ` [PATCH 15/39] mds: don't send MDentry{Link,Unlink} before receiving cache rejoin Yan, Zheng
@ 2013-03-20 22:17   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-20 22:17 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> The active MDS calls MDCache::rejoin_scour_survivor_replicas() when it
> receives the cache rejoin message. The function will remove the objects
> replicated by MDentry{Link,Unlink} from the replica map.
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc | 13 ++++++++++---
>  1 file changed, 10 insertions(+), 3 deletions(-)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 69db1dd..f102205 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -3893,6 +3893,8 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
>      }
>    }
>
> +  assert(rejoin_gather.count(from));
> +  rejoin_gather.erase(from);
>    if (survivor) {
>      // survivor.  do everything now.
>      for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
> @@ -3911,8 +3913,6 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
>        mds->locker->eval_gather(*p);
>    } else {
>      // done?
> -    assert(rejoin_gather.count(from));
> -    rejoin_gather.erase(from);
>      if (rejoin_gather.empty()) {
>        rejoin_gather_finish();
>      } else {
> @@ -9582,7 +9582,9 @@ void MDCache::send_dentry_link(CDentry *dn)
>    for (map<int,int>::iterator p = dn->replicas_begin();
>         p != dn->replicas_end();
>         ++p) {
> -    if (mds->mdsmap->get_state(p->first) < MDSMap::STATE_REJOIN)
> +    if (mds->mdsmap->get_state(p->first) < MDSMap::STATE_REJOIN ||
> +       (mds->mdsmap->get_state(p->first) == MDSMap::STATE_REJOIN &&
> +        rejoin_gather.count(p->first)))
>        continue;
>      CDentry::linkage_t *dnl = dn->get_linkage();
>      MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
> @@ -9668,6 +9670,11 @@ void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequest *mdr)
>      if (mdr && mdr->more()->witnessed.count(it->first))
>        continue;
>
> +    if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
> +       (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
> +        rejoin_gather.count(it->first)))
> +      continue;
> +
>      MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->name);
>      if (straydn)
>        replicate_stray(straydn, it->first, unlink->straybl);
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread
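
For readers following along, the check this patch adds to send_dentry_link()/send_dentry_unlink() amounts to: skip replicas on an MDS that is still rejoining and whose cache rejoin we have not processed yet, since rejoin_scour_survivor_replicas() would drop those replicas anyway. A rough standalone sketch of that gate, with simplified stand-in types (the enum values are illustrative, not the real MDSMap states):

// replica_gate_sketch.cc -- simplified illustration of the "hold replica
// updates until the peer's rejoin is processed" check; names are stand-ins.
#include <iostream>
#include <set>

enum class MDSState { Resolve, Rejoin, Active };

struct ReplicaGate {
  std::set<int> rejoin_gather;  // rejoining peers we have not heard from yet

  // True if it is safe to send MDentryLink/MDentryUnlink to 'peer'.
  bool can_send_replica_update(int peer, MDSState peer_state) const {
    if (peer_state < MDSState::Rejoin)
      return false;  // peer has not reached rejoin; it will rebuild later
    if (peer_state == MDSState::Rejoin && rejoin_gather.count(peer))
      return false;  // rejoining, but its cache rejoin is still outstanding
    return true;     // active, or rejoin already received
  }
};

int main() {
  ReplicaGate g;
  g.rejoin_gather.insert(2);
  std::cout << g.can_send_replica_update(2, MDSState::Rejoin) << "\n";  // 0
  g.rejoin_gather.erase(2);
  std::cout << g.can_send_replica_update(2, MDSState::Rejoin) << "\n";  // 1
  return 0;
}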

* Re: [PATCH 10/39] mds: unify slave request waiting
  2013-03-17 14:51 ` [PATCH 10/39] mds: unify slave request waiting Yan, Zheng
@ 2013-03-20 22:52   ` Sage Weil
  0 siblings, 0 replies; 117+ messages in thread
From: Sage Weil @ 2013-03-20 22:52 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, greg

Much simpler!

Reviewed-by: Sage Weil <sage@inktank.com>

On Sun, 17 Mar 2013, Yan, Zheng wrote:

> From: "Yan, Zheng" <zheng.z.yan@intel.com>
> 
> When requesting remote xlock or remote wrlock, the master request is
> put into the lock object's REMOTEXLOCK waiting queue. The problem is that
> the remote wrlock's target can be different from the lock's auth MDS. When
> the lock's auth MDS recovers, MDCache::handle_mds_recovery() may wake the
> wrong request. So just unify slave request waiting and dispatch the
> master request when the slave request reply is received.
> 
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/Locker.cc | 49 ++++++++++++++++++++++---------------------------
>  src/mds/Server.cc | 12 ++++++++++--
>  2 files changed, 32 insertions(+), 29 deletions(-)
> 
> diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
> index d06a9cc..0055a19 100644
> --- a/src/mds/Locker.cc
> +++ b/src/mds/Locker.cc
> @@ -544,8 +544,6 @@ void Locker::cancel_locking(Mutation *mut, set<CInode*> *pneed_issue)
>        if (need_issue)
>  	pneed_issue->insert(static_cast<CInode *>(lock->get_parent()));
>      }
> -  } else {
> -    lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK);
>    }
>    mut->finish_locking(lock);
>  }
> @@ -1326,18 +1324,16 @@ void Locker::remote_wrlock_start(SimpleLock *lock, int target, MDRequest *mut)
>    }
>  
>    // send lock request
> -  if (!lock->is_waiter_for(SimpleLock::WAIT_REMOTEXLOCK)) {
> -    mut->start_locking(lock, target);
> -    mut->more()->slaves.insert(target);
> -    MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt,
> -					       MMDSSlaveRequest::OP_WRLOCK);
> -    r->set_lock_type(lock->get_type());
> -    lock->get_parent()->set_object_info(r->get_object_info());
> -    mds->send_message_mds(r, target);
> -  }
> -  
> -  // wait
> -  lock->add_waiter(SimpleLock::WAIT_REMOTEXLOCK, new C_MDS_RetryRequest(mdcache, mut));
> +  mut->start_locking(lock, target);
> +  mut->more()->slaves.insert(target);
> +  MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt,
> +					     MMDSSlaveRequest::OP_WRLOCK);
> +  r->set_lock_type(lock->get_type());
> +  lock->get_parent()->set_object_info(r->get_object_info());
> +  mds->send_message_mds(r, target);
> +
> +  assert(mut->more()->waiting_on_slave.count(target) == 0);
> +  mut->more()->waiting_on_slave.insert(target);
>  }
>  
>  void Locker::remote_wrlock_finish(SimpleLock *lock, int target, Mutation *mut)
> @@ -1411,19 +1407,18 @@ bool Locker::xlock_start(SimpleLock *lock, MDRequest *mut)
>      }
>      
>      // send lock request
> -    if (!lock->is_waiter_for(SimpleLock::WAIT_REMOTEXLOCK)) {
> -      int auth = lock->get_parent()->authority().first;
> -      mut->more()->slaves.insert(auth);
> -      mut->start_locking(lock, auth);
> -      MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt,
> -						 MMDSSlaveRequest::OP_XLOCK);
> -      r->set_lock_type(lock->get_type());
> -      lock->get_parent()->set_object_info(r->get_object_info());
> -      mds->send_message_mds(r, auth);
> -    }
> -    
> -    // wait
> -    lock->add_waiter(SimpleLock::WAIT_REMOTEXLOCK, new C_MDS_RetryRequest(mdcache, mut));
> +    int auth = lock->get_parent()->authority().first;
> +    mut->more()->slaves.insert(auth);
> +    mut->start_locking(lock, auth);
> +    MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt,
> +					       MMDSSlaveRequest::OP_XLOCK);
> +    r->set_lock_type(lock->get_type());
> +    lock->get_parent()->set_object_info(r->get_object_info());
> +    mds->send_message_mds(r, auth);
> +
> +    assert(mut->more()->waiting_on_slave.count(auth) == 0);
> +    mut->more()->waiting_on_slave.insert(auth);
> +
>      return false;
>    }
>  }
> diff --git a/src/mds/Server.cc b/src/mds/Server.cc
> index 6d0519f..4c4c86b 100644
> --- a/src/mds/Server.cc
> +++ b/src/mds/Server.cc
> @@ -1371,7 +1371,11 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
>        mdr->locks.insert(lock);
>        mdr->finish_locking(lock);
>        lock->get_xlock(mdr, mdr->get_client());
> -      lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK);
> +
> +      assert(mdr->more()->waiting_on_slave.count(from));
> +      mdr->more()->waiting_on_slave.erase(from);
> +      assert(mdr->more()->waiting_on_slave.empty());
> +      dispatch_client_request(mdr);
>      }
>      break;
>      
> @@ -1385,7 +1389,11 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
>        mdr->remote_wrlocks[lock] = from;
>        mdr->locks.insert(lock);
>        mdr->finish_locking(lock);
> -      lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK);
> +
> +      assert(mdr->more()->waiting_on_slave.count(from));
> +      mdr->more()->waiting_on_slave.erase(from);
> +      assert(mdr->more()->waiting_on_slave.empty());
> +      dispatch_client_request(mdr);
>      }
>      break;
>  
> -- 
> 1.7.11.7
> 
> 
> 

^ permalink raw reply	[flat|nested] 117+ messages in thread
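
The simplification here is to replace the per-lock REMOTEXLOCK waiter with a per-request gather of outstanding slave ranks: the slave's reply erases its rank and re-dispatches the master request once nothing is left outstanding. A minimal sketch of that pattern, assuming simplified stand-in types (MasterRequest and dispatch are illustrative, not the real MDRequest/Server machinery):

// slave_wait_sketch.cc -- per-request tracking of outstanding slave replies.
#include <cassert>
#include <functional>
#include <iostream>
#include <set>

struct MasterRequest {
  std::set<int> waiting_on_slave;  // slave ranks we expect a reply from
  std::function<void()> dispatch;  // re-run the request once unblocked

  // Record that a slave lock request (xlock/wrlock) was sent to 'target'.
  void sent_slave_request(int target) {
    assert(waiting_on_slave.count(target) == 0);
    waiting_on_slave.insert(target);
  }

  // Called when the slave's reply (lock granted) arrives.
  void handle_slave_reply(int from) {
    assert(waiting_on_slave.count(from));
    waiting_on_slave.erase(from);
    if (waiting_on_slave.empty())
      dispatch();  // resume exactly this master request, no lock waiters
  }
};

int main() {
  MasterRequest mdr;
  mdr.dispatch = [] { std::cout << "re-dispatching master request\n"; };
  mdr.sent_slave_request(3);  // e.g. a remote wrlock request sent to mds.3
  mdr.handle_slave_reply(3);  // the reply wakes only this request
  return 0;
}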

* Re: [PATCH 16/39] mds: send cache rejoin messages after gathering all resolves
  2013-03-17 14:51 ` [PATCH 16/39] mds: send cache rejoin messages after gathering all resolves Yan, Zheng
@ 2013-03-20 22:57   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-20 22:57 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc | 10 ++++++++++
>  src/mds/MDCache.h  |  5 +++++
>  2 files changed, 15 insertions(+)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index f102205..6853bf1 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -2914,6 +2914,8 @@ void MDCache::maybe_resolve_finish()
>        recalc_auth_bits();
>        trim_non_auth();
>        mds->resolve_done();
> +    } else {
> +      maybe_send_pending_rejoins();
>      }
>    }
>  }
> @@ -3398,6 +3400,13 @@ void MDCache::rejoin_send_rejoins()
>  {
>    dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
>
> +  if (!resolve_gather.empty()) {
> +    dout(7) << "rejoin_send_rejoins still waiting for resolves ("
> +           << resolve_gather << ")" << dendl;
> +    rejoins_pending = true;
> +    return;
> +  }
> +
>    map<int, MMDSCacheRejoin*> rejoins;
>
>    // encode cap list once.
> @@ -3571,6 +3580,7 @@ void MDCache::rejoin_send_rejoins()
>      mds->send_message_mds(p->second, p->first);
>    }
>    rejoin_ack_gather.insert(mds->whoami);   // we need to complete rejoin_gather_finish, too
> +  rejoins_pending = false;
>
>    // nothing?
>    if (mds->is_rejoin() && rejoins.empty()) {
> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
> index 278debf..379f715 100644
> --- a/src/mds/MDCache.h
> +++ b/src/mds/MDCache.h
> @@ -383,6 +383,7 @@ public:
>
>  protected:
>    // [rejoin]
> +  bool rejoins_pending;
>    set<int> rejoin_gather;      // nodes from whom i need a rejoin
>    set<int> rejoin_sent;        // nodes i sent a rejoin to
>    set<int> rejoin_ack_gather;  // nodes from whom i need a rejoin ack
> @@ -417,6 +418,10 @@ protected:
>    void handle_cache_rejoin_full(MMDSCacheRejoin *m);
>    void rejoin_send_acks();
>    void rejoin_trim_undef_inodes();
> +  void maybe_send_pending_rejoins() {
> +    if (rejoins_pending)
> +      rejoin_send_rejoins();
> +  }
>  public:
>    void rejoin_gather_finish();
>    void rejoin_send_rejoins();
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread
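
rejoins_pending is a small deferral pattern: if the resolve gather has not drained yet, remember that rejoins are wanted and retry when the last resolve arrives. A standalone sketch under those simplified assumptions (names are illustrative, not the actual MDCache members):

// pending_rejoin_sketch.cc -- defer an action until a gather set drains.
#include <iostream>
#include <set>

struct RejoinSender {
  std::set<int> resolve_gather;  // ranks we still need resolves from
  bool rejoins_pending = false;

  void send_rejoins() {
    if (!resolve_gather.empty()) {
      rejoins_pending = true;  // retry once the remaining resolves arrive
      std::cout << "rejoins deferred\n";
      return;
    }
    rejoins_pending = false;
    std::cout << "rejoins sent\n";
  }

  void resolve_received(int from) {
    resolve_gather.erase(from);
    if (resolve_gather.empty() && rejoins_pending)
      send_rejoins();  // the maybe_send_pending_rejoins() equivalent
  }
};

int main() {
  RejoinSender s;
  s.resolve_gather = {1, 2};
  s.send_rejoins();        // deferred: resolves still outstanding
  s.resolve_received(1);
  s.resolve_received(2);   // last resolve triggers the deferred send
  return 0;
}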

* Re: [PATCH 17/39] mds: send resolve acks after master updates are safely logged
  2013-03-17 14:51 ` [PATCH 17/39] mds: send resolve acks after master updates are safely logged Yan, Zheng
@ 2013-03-20 22:58   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-20 22:58 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc | 33 +++++++++++++++++++++++++++++----
>  src/mds/MDCache.h  |  7 ++++++-
>  src/mds/Server.cc  |  9 +++++++++
>  src/mds/journal.cc |  2 +-
>  4 files changed, 45 insertions(+), 6 deletions(-)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 6853bf1..9b37b1e 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -2177,6 +2177,17 @@ void MDCache::committed_master_slave(metareqid_t r, int from)
>      log_master_commit(r);
>  }
>
> +void MDCache::logged_master_update(metareqid_t reqid)
> +{
> +  dout(10) << "logged_master_update " << reqid << dendl;
> +  assert(uncommitted_masters.count(reqid));
> +  uncommitted_masters[reqid].safe = true;
> +  if (pending_masters.count(reqid)) {
> +    pending_masters.erase(reqid);
> +    if (pending_masters.empty())
> +      process_delayed_resolve();
> +  }
> +}
>
>  /*
>   * The mds could crash after receiving all slaves' commit acknowledgement,
> @@ -2764,8 +2775,23 @@ void MDCache::handle_resolve(MMDSResolve *m)
>      return;
>    }
>
> +  discard_delayed_resolve(from);
> +
>    // ambiguous slave requests?
>    if (!m->slave_requests.empty()) {
> +    for (vector<metareqid_t>::iterator p = m->slave_requests.begin();
> +        p != m->slave_requests.end();
> +        ++p) {
> +      if (uncommitted_masters.count(*p) && !uncommitted_masters[*p].safe)
> +       pending_masters.insert(*p);
> +    }
> +
> +    if (!pending_masters.empty()) {
> +      dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
> +      delayed_resolve[from] = m;
> +      return;
> +    }
> +
>      MMDSResolveAck *ack = new MMDSResolveAck;
>      for (vector<metareqid_t>::iterator p = m->slave_requests.begin();
>          p != m->slave_requests.end();
> @@ -2788,7 +2814,6 @@ void MDCache::handle_resolve(MMDSResolve *m)
>
>    if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) {
>      dout(10) << "delay processing subtree resolve" << dendl;
> -    discard_delayed_resolve(from);
>      delayed_resolve[from] = m;
>      return;
>    }
> @@ -2883,10 +2908,10 @@ void MDCache::handle_resolve(MMDSResolve *m)
>  void MDCache::process_delayed_resolve()
>  {
>    dout(10) << "process_delayed_resolve" << dendl;
> -  for (map<int, MMDSResolve *>::iterator p = delayed_resolve.begin();
> -       p != delayed_resolve.end(); ++p)
> +  map<int, MMDSResolve*> tmp;
> +  tmp.swap(delayed_resolve);
> +  for (map<int, MMDSResolve*>::iterator p = tmp.begin(); p != tmp.end(); ++p)
>      handle_resolve(p->second);
> -  delayed_resolve.clear();
>  }
>
>  void MDCache::discard_delayed_resolve(int who)
> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
> index 379f715..8f262b9 100644
> --- a/src/mds/MDCache.h
> +++ b/src/mds/MDCache.h
> @@ -281,14 +281,16 @@ public:
>                                 snapid_t follows=CEPH_NOSNAP);
>
>    // slaves
> -  void add_uncommitted_master(metareqid_t reqid, LogSegment *ls, set<int> &slaves) {
> +  void add_uncommitted_master(metareqid_t reqid, LogSegment *ls, set<int> &slaves, bool safe=false) {
>      uncommitted_masters[reqid].ls = ls;
>      uncommitted_masters[reqid].slaves = slaves;
> +    uncommitted_masters[reqid].safe = safe;
>    }
>    void wait_for_uncommitted_master(metareqid_t reqid, Context *c) {
>      uncommitted_masters[reqid].waiters.push_back(c);
>    }
>    void log_master_commit(metareqid_t reqid);
> +  void logged_master_update(metareqid_t reqid);
>    void _logged_master_commit(metareqid_t reqid, LogSegment *ls, list<Context*> &waiters);
>    void committed_master_slave(metareqid_t r, int from);
>    void finish_committed_masters();
> @@ -320,9 +322,12 @@ protected:
>      set<int> slaves;
>      LogSegment *ls;
>      list<Context*> waiters;
> +    bool safe;
>    };
>    map<metareqid_t, umaster>                 uncommitted_masters;         // master: req -> slave set
>
> +  set<metareqid_t>             pending_masters;
> +
>    //map<metareqid_t, bool>     ambiguous_slave_updates;         // for log trimming.
>    //map<metareqid_t, Context*> waiting_for_slave_update_commit;
>    friend class ESlaveUpdate;
> diff --git a/src/mds/Server.cc b/src/mds/Server.cc
> index 8e89e4c..1330f11 100644
> --- a/src/mds/Server.cc
> +++ b/src/mds/Server.cc
> @@ -4463,6 +4463,9 @@ void Server::_link_remote_finish(MDRequest *mdr, bool inc,
>
>    assert(g_conf->mds_kill_link_at != 3);
>
> +  if (!mdr->more()->witnessed.empty())
> +    mdcache->logged_master_update(mdr->reqid);
> +
>    if (inc) {
>      // link the new dentry
>      dn->pop_projected_linkage();
> @@ -5073,6 +5076,9 @@ void Server::_unlink_local_finish(MDRequest *mdr,
>  {
>    dout(10) << "_unlink_local_finish " << *dn << dendl;
>
> +  if (!mdr->more()->witnessed.empty())
> +    mdcache->logged_master_update(mdr->reqid);
> +
>    // unlink main dentry
>    dn->get_dir()->unlink_inode(dn);
>    dn->pop_projected_linkage();
> @@ -5881,6 +5887,9 @@ void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDe
>  {
>    dout(10) << "_rename_finish " << *mdr << dendl;
>
> +  if (!mdr->more()->witnessed.empty())
> +    mdcache->logged_master_update(mdr->reqid);
> +
>    // apply
>    _rename_apply(mdr, srcdn, destdn, straydn);
>
> diff --git a/src/mds/journal.cc b/src/mds/journal.cc
> index 3375e40..6475eec 100644
> --- a/src/mds/journal.cc
> +++ b/src/mds/journal.cc
> @@ -1738,7 +1738,7 @@ void EUpdate::replay(MDS *mds)
>      dout(10) << "EUpdate.replay " << reqid << " had slaves, expecting a matching ECommitted" << dendl;
>      _segment->uncommitted_masters.insert(reqid);
>      set<int> slaves;
> -    mds->mdcache->add_uncommitted_master(reqid, _segment, slaves);
> +    mds->mdcache->add_uncommitted_master(reqid, _segment, slaves, true);
>    }
>
>    if (client_map.length()) {
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread
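
Two details in this patch are easy to miss: a resolve that names a master update which is not yet safely journaled is parked in delayed_resolve until logged_master_update() fires, and process_delayed_resolve() swaps the delayed map into a temporary before iterating, so messages re-delayed during the loop are neither lost nor processed twice. A self-contained sketch of both ideas, with simplified types in place of the real message classes:

// delayed_resolve_sketch.cc -- park resolves behind unjournaled master updates.
#include <iostream>
#include <map>
#include <set>
#include <string>

struct ResolveQueue {
  std::set<std::string> pending_masters;       // reqids not yet safely logged
  std::map<int, std::string> delayed_resolve;  // peer rank -> parked resolve

  void handle_resolve(int from, const std::string& reqid) {
    if (pending_masters.count(reqid)) {
      delayed_resolve[from] = reqid;  // wait until the update hits the journal
      std::cout << "delayed resolve from mds." << from << "\n";
      return;
    }
    std::cout << "acking resolve from mds." << from << "\n";
  }

  // Journal-commit callback for a master update.
  void logged_master_update(const std::string& reqid) {
    pending_masters.erase(reqid);
    if (pending_masters.empty())
      process_delayed_resolve();
  }

  void process_delayed_resolve() {
    std::map<int, std::string> tmp;
    tmp.swap(delayed_resolve);             // iterate a stable snapshot
    for (const auto& p : tmp)
      handle_resolve(p.first, p.second);   // may safely re-delay into the map
  }
};

int main() {
  ResolveQueue q;
  q.pending_masters.insert("req1");
  q.handle_resolve(4, "req1");     // parked
  q.logged_master_update("req1");  // journaled; the parked resolve is acked
  return 0;
}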

* Re: [PATCH 19/39] mds: remove MDCache::rejoin_fetch_dirfrags()
  2013-03-17 14:51 ` [PATCH 19/39] mds: remove MDCache::rejoin_fetch_dirfrags() Yan, Zheng
@ 2013-03-20 22:58   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-20 22:58 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Nice.
Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> In commit 77946dcdae (mds: fetch missing inodes from disk), I introduced
> MDCache::rejoin_fetch_dirfrags(). But it basically duplicates the function
> of MDCache::open_undef_dirfrags(), so just remove rejoin_fetch_dirfrags()
> and make open_undef_dirfrags() also handle undefined inodes.
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/CDir.cc    |  70 +++++++++++--------
>  src/mds/MDCache.cc | 193 +++++++++++++++++------------------------------------
>  src/mds/MDCache.h  |   5 +-
>  3 files changed, 107 insertions(+), 161 deletions(-)
>
> diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
> index 231630e..af0ae9c 100644
> --- a/src/mds/CDir.cc
> +++ b/src/mds/CDir.cc
> @@ -1553,33 +1553,32 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn)
>        if (stale)
>         continue;
>
> +      bool undef_inode = false;
>        if (dn) {
> -        if (dn->get_linkage()->get_inode() == 0) {
> -          dout(12) << "_fetched  had NEG dentry " << *dn << dendl;
> -        } else {
> -          dout(12) << "_fetched  had dentry " << *dn << dendl;
> -        }
> -      } else {
> +       CInode *in = dn->get_linkage()->get_inode();
> +       if (in) {
> +         dout(12) << "_fetched  had dentry " << *dn << dendl;
> +         if (in->state_test(CInode::STATE_REJOINUNDEF)) {
> +           assert(cache->mds->is_rejoin());
> +           assert(in->vino() == vinodeno_t(inode.ino, last));
> +           in->state_clear(CInode::STATE_REJOINUNDEF);
> +           cache->opened_undef_inode(in);
> +           undef_inode = true;
> +         }
> +       } else
> +         dout(12) << "_fetched  had NEG dentry " << *dn << dendl;
> +      }
> +
> +      if (!dn || undef_inode) {
>         // add inode
>         CInode *in = cache->get_inode(inode.ino, last);
> -       if (in) {
> -         dout(0) << "_fetched  badness: got (but i already had) " << *in
> -                 << " mode " << in->inode.mode
> -                 << " mtime " << in->inode.mtime << dendl;
> -         string dirpath, inopath;
> -         this->inode->make_path_string(dirpath);
> -         in->make_path_string(inopath);
> -         clog.error() << "loaded dup inode " << inode.ino
> -           << " [" << first << "," << last << "] v" << inode.version
> -           << " at " << dirpath << "/" << dname
> -           << ", but inode " << in->vino() << " v" << in->inode.version
> -           << " already exists at " << inopath << "\n";
> -         continue;
> -       } else {
> -         // inode
> -         in = new CInode(cache, true, first, last);
> -         in->inode = inode;
> +       if (!in || undef_inode) {
> +         if (undef_inode)
> +           in->first = first;
> +         else
> +           in = new CInode(cache, true, first, last);
>
> +         in->inode = inode;
>           // symlink?
>           if (in->is_symlink())
>             in->symlink = symlink;
> @@ -1591,11 +1590,13 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn)
>           if (snaps)
>             in->purge_stale_snap_data(*snaps);
>
> -         // add
> -         cache->add_inode( in );
> -
> -         // link
> -         dn = add_primary_dentry(dname, in, first, last);
> +         if (undef_inode) {
> +           if (inode.anchored)
> +             dn->adjust_nested_anchors(1);
> +         } else {
> +           cache->add_inode( in ); // add
> +           dn = add_primary_dentry(dname, in, first, last); // link
> +         }
>           dout(12) << "_fetched  got " << *dn << " " << *in << dendl;
>
>           if (in->inode.is_dirty_rstat())
> @@ -1604,6 +1605,19 @@ void CDir::_fetched(bufferlist &bl, const string& want_dn)
>           //in->hack_accessed = false;
>           //in->hack_load_stamp = ceph_clock_now(g_ceph_context);
>           //num_new_inodes_loaded++;
> +       } else {
> +         dout(0) << "_fetched  badness: got (but i already had) " << *in
> +                 << " mode " << in->inode.mode
> +                 << " mtime " << in->inode.mtime << dendl;
> +         string dirpath, inopath;
> +         this->inode->make_path_string(dirpath);
> +         in->make_path_string(inopath);
> +         clog.error() << "loaded dup inode " << inode.ino
> +           << " [" << first << "," << last << "] v" << inode.version
> +           << " at " << dirpath << "/" << dname
> +           << ", but inode " << in->vino() << " v" << in->inode.version
> +           << " already exists at " << inopath << "\n";
> +         continue;
>         }
>        }
>      } else {
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index d934020..008a8a2 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -4178,7 +4178,6 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack,
>
>  CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
>  {
> -  assert(0);
>    CInode *in = new CInode(this, true, 1, last);
>    in->inode.ino = ino;
>    in->state_set(CInode::STATE_REJOINUNDEF);
> @@ -4190,16 +4189,13 @@ CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
>
>  CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
>  {
> -  assert(0);
>    CInode *in = get_inode(df.ino);
> -  if (!in) {
> +  if (!in)
>      in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
> -    if (!in->is_dir()) {
> -      assert(in->state_test(CInode::STATE_REJOINUNDEF));
> -      in->inode.mode = S_IFDIR;
> -    }
> +  if (!in->is_dir()) {
> +    assert(in->state_test(CInode::STATE_REJOINUNDEF));
> +    in->inode.mode = S_IFDIR;
>    }
> -  assert(in->is_dir());
>    CDir *dir = in->get_or_open_dirfrag(this, df.frag);
>    dir->state_set(CDir::STATE_REJOINUNDEF);
>    rejoin_undef_dirfrags.insert(dir);
> @@ -4207,81 +4203,6 @@ CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
>    return dir;
>  }
>
> -bool MDCache::rejoin_fetch_dirfrags(MMDSCacheRejoin *strong)
> -{
> -  int skipped = 0;
> -  set<CDir*> fetch_queue;
> -  for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = strong->strong_dirfrags.begin();
> -       p != strong->strong_dirfrags.end();
> -       ++p) {
> -    CInode *diri = get_inode(p->first.ino);
> -    if (!diri) {
> -      skipped++;
> -      continue;
> -    }
> -    CDir *dir = diri->get_dirfrag(p->first.frag);
> -    if (dir && dir->is_complete())
> -      continue;
> -
> -    set<CDir*> frags;
> -    bool refragged = false;
> -    if (!dir) {
> -      if (diri->dirfragtree.is_leaf(p->first.frag))
> -       dir = diri->get_or_open_dirfrag(this, p->first.frag);
> -      else {
> -       list<frag_t> ls;
> -       diri->dirfragtree.get_leaves_under(p->first.frag, ls);
> -       if (ls.empty())
> -         ls.push_back(diri->dirfragtree[p->first.frag.value()]);
> -       for (list<frag_t>::iterator q = ls.begin(); q != ls.end(); ++q) {
> -         dir = diri->get_or_open_dirfrag(this, p->first.frag);
> -         frags.insert(dir);
> -       }
> -       refragged = true;
> -      }
> -    }
> -
> -    map<string_snap_t,MMDSCacheRejoin::dn_strong>& dmap = strong->strong_dentries[p->first];
> -    for (map<string_snap_t,MMDSCacheRejoin::dn_strong>::iterator q = dmap.begin();
> -       q != dmap.end();
> -       ++q) {
> -      if (!q->second.is_primary())
> -       continue;
> -      CDentry *dn;
> -      if (!refragged)
> -       dn = dir->lookup(q->first.name, q->first.snapid);
> -      else {
> -       frag_t fg = diri->pick_dirfrag(q->first.name);
> -       dir = diri->get_dirfrag(fg);
> -       assert(dir);
> -       dn = dir->lookup(q->first.name, q->first.snapid);
> -      }
> -      if (!dn) {
> -       fetch_queue.insert(dir);
> -       if (!refragged)
> -         break;
> -       frags.erase(dir);
> -       if (frags.empty())
> -         break;
> -      }
> -    }
> -  }
> -
> -  if (!fetch_queue.empty()) {
> -    dout(10) << "rejoin_fetch_dirfrags " << fetch_queue.size() << " dirfrags" << dendl;
> -    strong->get();
> -    C_GatherBuilder gather(g_ceph_context, new C_MDS_RetryMessage(mds, strong));
> -    for (set<CDir*>::iterator p = fetch_queue.begin(); p != fetch_queue.end(); ++p) {
> -      CDir *dir = *p;
> -      dir->fetch(gather.new_sub());
> -    }
> -    gather.activate();
> -    return true;
> -  }
> -  assert(!skipped);
> -  return false;
> -}
> -
>  /* This functions DOES NOT put the passed message before returning */
>  void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>  {
> @@ -4290,11 +4211,6 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>    // only a recovering node will get a strong rejoin.
>    assert(mds->is_rejoin());
>
> -  if (rejoin_fetch_dirfrags(strong))
> -    return;
> -
> -  MMDSCacheRejoin *missing = 0;  // if i'm missing something..
> -
>    // assimilate any potentially dirty scatterlock state
>    for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = strong->inode_scatterlocks.begin();
>         p != strong->inode_scatterlocks.end();
> @@ -4319,12 +4235,16 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>         p != strong->strong_dirfrags.end();
>         ++p) {
>      CInode *diri = get_inode(p->first.ino);
> +    if (!diri)
> +      diri = rejoin_invent_inode(p->first.ino, CEPH_NOSNAP);
>      CDir *dir = diri->get_dirfrag(p->first.frag);
>      bool refragged = false;
>      if (dir) {
>        dout(10) << " have " << *dir << dendl;
>      } else {
> -      if (diri->dirfragtree.is_leaf(p->first.frag))
> +      if (diri->state_test(CInode::STATE_REJOINUNDEF))
> +       dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
> +      else if (diri->dirfragtree.is_leaf(p->first.frag))
>         dir = rejoin_invent_dirfrag(p->first);
>      }
>      if (dir) {
> @@ -4369,15 +4289,9 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>         } else if (q->second.is_null()) {
>           dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
>         } else {
> -         assert(0);
>           CInode *in = get_inode(q->second.ino, q->first.snapid);
>           if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid);
>           dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid);
> -
> -         dout(10) << " missing " << q->second.ino << "." << q->first.snapid << dendl;
> -         if (!missing)
> -           missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING);
> -         missing->add_weak_inode(vinodeno_t(q->second.ino, q->first.snapid));  // we want it back!
>         }
>         dout(10) << " invented " << *dn << dendl;
>        }
> @@ -4513,19 +4427,15 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>      in->add_replica(from);
>    }
>
> -  // send missing?
> -  if (missing) {
> -    // we expect a FULL soon.
> -    mds->send_message(missing, strong->get_connection());
> +
> +
> +  // done?
> +  assert(rejoin_gather.count(from));
> +  rejoin_gather.erase(from);
> +  if (rejoin_gather.empty()) {
> +    rejoin_gather_finish();
>    } else {
> -    // done?
> -    assert(rejoin_gather.count(from));
> -    rejoin_gather.erase(from);
> -    if (rejoin_gather.empty()) {
> -      rejoin_gather_finish();
> -    } else {
> -      dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
> -    }
> +    dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
>    }
>  }
>
> @@ -4800,7 +4710,8 @@ void MDCache::rejoin_gather_finish()
>    dout(10) << "rejoin_gather_finish" << dendl;
>    assert(mds->is_rejoin());
>
> -  rejoin_trim_undef_inodes();
> +  if (open_undef_inodes_dirfrags())
> +    return;
>
>    // fetch paths?
>    //  do this before ack, since some inodes we may have already gotten
> @@ -5152,44 +5063,62 @@ void MDCache::open_snap_parents()
>      gather.set_finisher(new C_MDC_OpenSnapParents(this));
>      gather.activate();
>    } else {
> +    assert(rejoin_waiters.empty());
>      assert(missing_snap_parents.empty());
>      assert(reconnected_snaprealms.empty());
>      dout(10) << "open_snap_parents - all open" << dendl;
>      do_delayed_cap_imports();
>
> -    open_undef_dirfrags();
> +    start_files_to_recover(rejoin_recover_q, rejoin_check_q);
> +    mds->rejoin_done();
>    }
>  }
>
> -struct C_MDC_OpenUndefDirfragsFinish : public Context {
> -  MDCache *cache;
> -  C_MDC_OpenUndefDirfragsFinish(MDCache *c) : cache(c) {}
> -  void finish(int r) {
> -    cache->open_undef_dirfrags();
> +bool MDCache::open_undef_inodes_dirfrags()
> +{
> +  dout(10) << "open_undef_inodes_dirfrags "
> +          << rejoin_undef_inodes.size() << " inodes "
> +          << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
> +
> +  set<CDir*> fetch_queue = rejoin_undef_dirfrags;
> +
> +  for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
> +       p != rejoin_undef_inodes.end();
> +       ++p) {
> +    CInode *in = *p;
> +    assert(!in->is_base());
> +    fetch_queue.insert(in->get_parent_dir());
>    }
> -};
>
> -void MDCache::open_undef_dirfrags()
> -{
> -  dout(10) << "open_undef_dirfrags " << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
> -
> -  C_GatherBuilder gather(g_ceph_context);
> -  for (set<CDir*>::iterator p = rejoin_undef_dirfrags.begin();
> -       p != rejoin_undef_dirfrags.end();
> +  if (fetch_queue.empty())
> +    return false;
> +
> +  C_GatherBuilder gather(g_ceph_context, new C_MDC_RejoinGatherFinish(this));
> +  for (set<CDir*>::iterator p = fetch_queue.begin();
> +       p != fetch_queue.end();
>         ++p) {
>      CDir *dir = *p;
> +    CInode *diri = dir->get_inode();
> +    if (diri->state_test(CInode::STATE_REJOINUNDEF))
> +      continue;
> +    if (dir->state_test(CDir::STATE_REJOINUNDEF) && dir->get_frag() == frag_t()) {
> +      rejoin_undef_dirfrags.erase(dir);
> +      dir->state_clear(CDir::STATE_REJOINUNDEF);
> +      diri->force_dirfrags();
> +      list<CDir*> ls;
> +      diri->get_dirfrags(ls);
> +      for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
> +       rejoin_undef_dirfrags.insert(*q);
> +       (*q)->state_set(CDir::STATE_REJOINUNDEF);
> +       (*q)->fetch(gather.new_sub());
> +      }
> +      continue;
> +    }
>      dir->fetch(gather.new_sub());
>    }
> -
> -  if (gather.has_subs()) {
> -    gather.set_finisher(new C_MDC_OpenUndefDirfragsFinish(this));
> -    gather.activate();
> -  }
> -  else {
> -    start_files_to_recover(rejoin_recover_q, rejoin_check_q);
> -    mds->queue_waiters(rejoin_waiters);
> -    mds->rejoin_done();
> -  }
> +  assert(gather.has_subs());
> +  gather.activate();
> +  return true;
>  }
>
>  void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq)
> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
> index a05ced7..85f5d65 100644
> --- a/src/mds/MDCache.h
> +++ b/src/mds/MDCache.h
> @@ -496,10 +496,13 @@ public:
>    void check_realm_past_parents(SnapRealm *realm);
>    void open_snap_parents();
>
> -  void open_undef_dirfrags();
> +  bool open_undef_inodes_dirfrags();
>    void opened_undef_dirfrag(CDir *dir) {
>      rejoin_undef_dirfrags.erase(dir);
>    }
> +  void opened_undef_inode(CInode *in) {
> +    rejoin_undef_inodes.erase(in);
> +  }
>
>    void reissue_all_caps();
>
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread
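
The reworked open_undef_inodes_dirfrags() follows the usual gather pattern: collect the dirfrags to fetch (including the parent dirfrags of undefined inodes), issue all fetches, and run one finisher when the last fetch completes, which retries the rejoin finish. A minimal standalone sketch of that shape; the counter below stands in for C_GatherBuilder and the names are illustrative:

// undef_fetch_sketch.cc -- fetch a batch of items, run one finisher at the end.
#include <functional>
#include <iostream>
#include <memory>
#include <set>
#include <string>

struct Gather {
  int outstanding = 0;
  std::function<void()> finisher;
  void sub_done() {
    if (--outstanding == 0 && finisher)
      finisher();  // every sub-operation completed
  }
};

// Returns true if fetches were issued (caller should return and wait).
bool open_undef_items(const std::set<std::string>& undef_dirfrags,
                      const std::set<std::string>& undef_inodes,
                      const std::function<void()>& retry) {
  std::set<std::string> fetch_queue = undef_dirfrags;
  for (const auto& ino : undef_inodes)
    fetch_queue.insert("parent_of_" + ino);  // fetch the inode's parent dirfrag
  if (fetch_queue.empty())
    return false;  // nothing undefined, the caller can finish directly

  auto gather = std::make_shared<Gather>();
  gather->finisher = retry;  // e.g. retry rejoin_gather_finish()
  gather->outstanding = static_cast<int>(fetch_queue.size());
  for (const auto& df : fetch_queue) {
    std::cout << "fetching " << df << "\n";
    gather->sub_done();  // stand-in for the asynchronous fetch completion
  }
  return true;
}

int main() {
  std::set<std::string> dirs = {"dirfrag 100.0"};
  std::set<std::string> inos = {"ino 200"};
  open_undef_items(dirs, inos, [] { std::cout << "retrying rejoin finish\n"; });
  return 0;
}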

* Re: [PATCH 20/39] mds: include replica nonce in MMDSCacheRejoin::inode_strong
  2013-03-17 14:51 ` [PATCH 20/39] mds: include replica nonce in MMDSCacheRejoin::inode_strong Yan, Zheng
@ 2013-03-20 23:26   ` Gregory Farnum
  2013-03-20 23:36     ` Sage Weil
  0 siblings, 1 reply; 117+ messages in thread
From: Gregory Farnum @ 2013-03-20 23:26 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> So the recovering MDS can properly handle cache expire messages.
> Also increase the nonce value when sending the cache rejoin acks.
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc             | 35 +++++++++++++++++++++++------------
>  src/messages/MMDSCacheRejoin.h | 11 +++++++----
>  2 files changed, 30 insertions(+), 16 deletions(-)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 008a8a2..8ba676e 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -3538,6 +3538,7 @@ void MDCache::rejoin_send_rejoins()
>        if (p->first == 0 && root) {
>         p->second->add_weak_inode(root->vino());
>         p->second->add_strong_inode(root->vino(),
> +                                   root->get_replica_nonce(),
>                                     root->get_caps_wanted(),
>                                     root->filelock.get_state(),
>                                     root->nestlock.get_state(),
> @@ -3551,6 +3552,7 @@ void MDCache::rejoin_send_rejoins()
>        if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
>         p->second->add_weak_inode(in->vino());
>         p->second->add_strong_inode(in->vino(),
> +                                   in->get_replica_nonce(),
>                                     in->get_caps_wanted(),
>                                     in->filelock.get_state(),
>                                     in->nestlock.get_state(),
> @@ -3709,6 +3711,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
>         CInode *in = dnl->get_inode();
>         dout(15) << " add_strong_inode " << *in << dendl;
>         rejoin->add_strong_inode(in->vino(),
> +                                in->get_replica_nonce(),
>                                  in->get_caps_wanted(),
>                                  in->filelock.get_state(),
>                                  in->nestlock.get_state(),
> @@ -4248,7 +4251,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>         dir = rejoin_invent_dirfrag(p->first);
>      }
>      if (dir) {
> -      dir->add_replica(from);
> +      dir->add_replica(from, p->second.nonce);
>        dir->dir_rep = p->second.dir_rep;
>      } else {
>        dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
> @@ -4263,7 +4266,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>           dir = rejoin_invent_dirfrag(p->first);
>         else
>           dout(10) << " have(approx) " << *dir << dendl;
> -       dir->add_replica(from);
> +       dir->add_replica(from, p->second.nonce);
>         dir->dir_rep = p->second.dir_rep;
>        }
>        refragged = true;
> @@ -4327,7 +4330,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>         mdr->locks.insert(&dn->lock);
>        }
>
> -      dn->add_replica(from);
> +      dn->add_replica(from, q->second.nonce);
>        dout(10) << " have " << *dn << dendl;
>
>        // inode?
> @@ -4412,7 +4415,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>           dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl;
>         }
>
> -       in->add_replica(from);
> +       in->add_replica(from, p->second.nonce);
>         dout(10) << " have " << *in << dendl;
>        }
>      }
> @@ -5176,7 +5179,7 @@ void MDCache::rejoin_send_acks()
>        for (map<int,int>::iterator r = dir->replicas_begin();
>            r != dir->replicas_end();
>            ++r)
> -       ack[r->first]->add_strong_dirfrag(dir->dirfrag(), r->second, dir->dir_rep);
> +       ack[r->first]->add_strong_dirfrag(dir->dirfrag(), ++r->second, dir->dir_rep);
>
>        for (CDir::map_t::iterator q = dir->items.begin();
>            q != dir->items.end();
> @@ -5192,7 +5195,7 @@ void MDCache::rejoin_send_acks()
>                                            dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
>                                            dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
>                                            dnl->is_remote() ? dnl->get_remote_d_type():0,
> -                                          r->second,
> +                                          ++r->second,
>                                            dn->lock.get_replica_state());
>
>         if (!dnl->is_primary())
> @@ -5205,7 +5208,7 @@ void MDCache::rejoin_send_acks()
>              r != in->replicas_end();
>              ++r) {
>           ack[r->first]->add_inode_base(in);
> -         ack[r->first]->add_inode_locks(in, r->second);
> +         ack[r->first]->add_inode_locks(in, ++r->second);
>         }
>
>         // subdirs in this subtree?
> @@ -5220,14 +5223,14 @@ void MDCache::rejoin_send_acks()
>          r != root->replicas_end();
>          ++r) {
>        ack[r->first]->add_inode_base(root);
> -      ack[r->first]->add_inode_locks(root, r->second);
> +      ack[r->first]->add_inode_locks(root, ++r->second);
>      }
>    if (myin)
>      for (map<int,int>::iterator r = myin->replicas_begin();
>          r != myin->replicas_end();
>          ++r) {
>        ack[r->first]->add_inode_base(myin);
> -      ack[r->first]->add_inode_locks(myin, r->second);
> +      ack[r->first]->add_inode_locks(myin, ++r->second);
>      }
>
>    // include inode base for any inodes whose scatterlocks may have updated
> @@ -5728,6 +5731,12 @@ void MDCache::send_expire_messages(map<int, MCacheExpire*>& expiremap)
>    for (map<int, MCacheExpire*>::iterator it = expiremap.begin();
>         it != expiremap.end();
>         ++it) {
> +    if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
> +       (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
> +        rejoin_sent.count(it->first) == 0)) {
> +      it->second->put();
> +      continue;
> +    }
>      dout(7) << "sending cache_expire to " << it->first << dendl;
>      mds->send_message_mds(it->second, it->first);
>    }
> @@ -9640,9 +9649,11 @@ void MDCache::handle_dentry_link(MDentryLink *m)
>      CInode *in = add_replica_inode(p, NULL, finished);
>      assert(in->get_num_ref() == 0);
>      assert(in->get_parent_dn() == NULL);
> -    MCacheExpire* expire = new MCacheExpire(mds->get_nodeid());
> -    expire->add_inode(m->get_subtree(), in->vino(), in->get_replica_nonce());
> -    mds->send_message_mds(expire, m->get_source().num());
> +    map<int, MCacheExpire*> expiremap;
> +    int from = m->get_source().num();
> +    expiremap[from] = new MCacheExpire(mds->get_nodeid());
> +    expiremap[from]->add_inode(m->get_subtree(), in->vino(), in->get_replica_nonce());
> +    send_expire_messages(expiremap);
>      remove_inode(in);
>    }
>
> diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h
> index 825400d..b88f551 100644
> --- a/src/messages/MMDSCacheRejoin.h
> +++ b/src/messages/MMDSCacheRejoin.h
> @@ -43,19 +43,22 @@ class MMDSCacheRejoin : public Message {
>
>    // -- types --
>    struct inode_strong {
> +    int32_t nonce;
>      int32_t caps_wanted;
>      int32_t filelock, nestlock, dftlock;
>      inode_strong() {}
> -    inode_strong(int cw, int dl, int nl, int dftl) :
> -      caps_wanted(cw),
> +    inode_strong(int n, int cw, int dl, int nl, int dftl) :
> +      nonce(n), caps_wanted(cw),
>        filelock(dl), nestlock(nl), dftlock(dftl) { }
>      void encode(bufferlist &bl) const {
> +      ::encode(nonce, bl);
>        ::encode(caps_wanted, bl);
>        ::encode(filelock, bl);
>        ::encode(nestlock, bl);
>        ::encode(dftlock, bl);
>      }
>      void decode(bufferlist::iterator &bl) {
> +      ::decode(nonce, bl);
>        ::decode(caps_wanted, bl);
>        ::decode(filelock, bl);
>        ::decode(nestlock, bl);

This is a wire format change without any versioning to cover it —
we're going to need to at a minimum add feature bits to cover this. It
might be more appropriate to introduce proper versioning at the same
time, though. You should find examples of everything you need in my
recent encoding changes.

The rest looks good.
-Greg

> @@ -208,8 +211,8 @@ public:
>    void add_weak_inode(vinodeno_t i) {
>      weak_inodes.insert(i);
>    }
> -  void add_strong_inode(vinodeno_t i, int cw, int dl, int nl, int dftl) {
> -    strong_inodes[i] = inode_strong(cw, dl, nl, dftl);
> +  void add_strong_inode(vinodeno_t i, int n, int cw, int dl, int nl, int dftl) {
> +    strong_inodes[i] = inode_strong(n, cw, dl, nl, dftl);
>    }
>    void add_inode_locks(CInode *in, __u32 nonce) {
>      ::encode(in->inode.ino, inode_locks);

^ permalink raw reply	[flat|nested] 117+ messages in thread
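
On Greg's versioning point: adding inode_strong::nonce changes the wire format, so the new field has to be gated, either by peer feature bits or by a struct version, before mixed-version clusters can talk to each other. Ceph's tree has dedicated helpers for this (the ENCODE_START/DECODE_START macros, if I recall correctly); the snippet below is only a rough, self-contained illustration of the idea using a plain byte vector and a hypothetical struct, not the real encoding framework:

// versioned_encode_sketch.cc -- gate a newly added field behind a version byte.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

struct InodeStrong {
  int32_t nonce = 0;        // the newly added field (version 2 only)
  int32_t caps_wanted = 0;

  static void put32(std::vector<uint8_t>& bl, int32_t v) {
    const uint8_t* p = reinterpret_cast<const uint8_t*>(&v);
    bl.insert(bl.end(), p, p + sizeof(v));
  }
  static int32_t get32(const std::vector<uint8_t>& bl, size_t& off) {
    int32_t v = 0;
    std::memcpy(&v, bl.data() + off, sizeof(v));
    off += sizeof(v);
    return v;
  }

  void encode(std::vector<uint8_t>& bl, bool peer_has_feature) const {
    uint8_t version = peer_has_feature ? 2 : 1;
    bl.push_back(version);   // struct version goes on the wire first
    if (version >= 2)
      put32(bl, nonce);      // only peers that understand v2 see the nonce
    put32(bl, caps_wanted);
  }

  void decode(const std::vector<uint8_t>& bl, size_t& off) {
    uint8_t version = bl[off++];
    if (version >= 2)
      nonce = get32(bl, off);  // absent in old-format messages
    caps_wanted = get32(bl, off);
  }
};

int main() {
  InodeStrong a, b;
  a.nonce = 7;
  a.caps_wanted = 42;
  std::vector<uint8_t> bl;
  a.encode(bl, /*peer_has_feature=*/true);
  size_t off = 0;
  b.decode(bl, off);
  std::cout << b.nonce << " " << b.caps_wanted << "\n";  // prints: 7 42
  return 0;
}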

* Re: [PATCH 21/39] mds: encode dirfrag base in cache rejoin ack
  2013-03-17 14:51 ` [PATCH 21/39] mds: encode dirfrag base in cache rejoin ack Yan, Zheng
@ 2013-03-20 23:33   ` Gregory Farnum
  2013-03-20 23:40     ` Gregory Farnum
  2013-03-21  6:41     ` Yan, Zheng
  0 siblings, 2 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-20 23:33 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

This needs to handle versioning the encoding based on peer feature bits too.

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> Cache rejoin ack message already encodes inode base, make it also encode
> dirfrag base. This allows the message to replicate stray dentries like the
> MDentryUnlink message does. The function will be used by a later patch.
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/CDir.h                 | 20 +++++++++++++-------
>  src/mds/MDCache.cc             | 20 ++++++++++++++++++--
>  src/messages/MMDSCacheRejoin.h | 12 +++++++++++-
>  3 files changed, 42 insertions(+), 10 deletions(-)
>
> diff --git a/src/mds/CDir.h b/src/mds/CDir.h
> index 79946f1..f4a3a3d 100644
> --- a/src/mds/CDir.h
> +++ b/src/mds/CDir.h
> @@ -437,23 +437,29 @@ private:
>      ::encode(dist, bl);
>    }
>
> -  void encode_replica(int who, bufferlist& bl) {
> -    __u32 nonce = add_replica(who);
> -    ::encode(nonce, bl);
> +  void _encode_base(bufferlist& bl) {
>      ::encode(first, bl);
>      ::encode(fnode, bl);
>      ::encode(dir_rep, bl);
>      ::encode(dir_rep_by, bl);
>    }
> -  void decode_replica(bufferlist::iterator& p) {
> -    __u32 nonce;
> -    ::decode(nonce, p);
> -    replica_nonce = nonce;
> +  void _decode_base(bufferlist::iterator& p) {
>      ::decode(first, p);
>      ::decode(fnode, p);
>      ::decode(dir_rep, p);
>      ::decode(dir_rep_by, p);
>    }
> +  void encode_replica(int who, bufferlist& bl) {
> +    __u32 nonce = add_replica(who);
> +    ::encode(nonce, bl);
> +    _encode_base(bl);
> +  }
> +  void decode_replica(bufferlist::iterator& p) {
> +    __u32 nonce;
> +    ::decode(nonce, p);
> +    replica_nonce = nonce;
> +    _decode_base(p);
> +  }
>
>
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 8ba676e..344777e 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -4510,8 +4510,22 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>      }
>    }
>
> +  // full dirfrags
> +  bufferlist::iterator p = ack->dirfrag_base.begin();
> +  while (!p.end()) {
> +    dirfrag_t df;
> +    bufferlist basebl;
> +    ::decode(df, p);
> +    ::decode(basebl, p);
> +    CDir *dir = get_dirfrag(df);
> +    assert(dir);
> +    bufferlist::iterator q = basebl.begin();
> +    dir->_decode_base(q);
> +    dout(10) << " got dir replica " << *dir << dendl;
> +  }
> +
>    // full inodes
> -  bufferlist::iterator p = ack->inode_base.begin();
> +  p = ack->inode_base.begin();
>    while (!p.end()) {
>      inodeno_t ino;
>      snapid_t last;
> @@ -5178,8 +5192,10 @@ void MDCache::rejoin_send_acks()
>        // dir
>        for (map<int,int>::iterator r = dir->replicas_begin();
>            r != dir->replicas_end();
> -          ++r)
> +          ++r) {
>         ack[r->first]->add_strong_dirfrag(dir->dirfrag(), ++r->second, dir->dir_rep);
> +       ack[r->first]->add_dirfrag_base(dir);
> +      }
>
>        for (CDir::map_t::iterator q = dir->items.begin();
>            q != dir->items.end();
> diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h
> index b88f551..7c37ab4 100644
> --- a/src/messages/MMDSCacheRejoin.h
> +++ b/src/messages/MMDSCacheRejoin.h
> @@ -20,6 +20,7 @@
>  #include "include/types.h"
>
>  #include "mds/CInode.h"
> +#include "mds/CDir.h"
>
>  // sent from replica to auth
>
> @@ -169,6 +170,7 @@ class MMDSCacheRejoin : public Message {
>    // full
>    bufferlist inode_base;
>    bufferlist inode_locks;
> +  bufferlist dirfrag_base;
>
>    // authpins, xlocks
>    struct slave_reqid {
> @@ -258,7 +260,13 @@ public:
>    void add_strong_dirfrag(dirfrag_t df, int n, int dr) {
>      strong_dirfrags[df] = dirfrag_strong(n, dr);
>    }
> -
> +  void add_dirfrag_base(CDir *dir) {
> +    ::encode(dir->dirfrag(), dirfrag_base);
> +    bufferlist bl;
> +    dir->_encode_base(bl);
> +    ::encode(bl, dirfrag_base);
> +  }

We are guilty of doing this in other places, but we should avoid
implicit encodings like this one, especially when the decode happens
somewhere else like it does here. We can make a vector dirfrag_bases
and add to that, and then encode and decode it along with the rest of
the message — would that work for your purposes?
-Greg

> +
>    // dentries
>    void add_weak_dirfrag(dirfrag_t df) {
>      weak_dirfrags.insert(df);
> @@ -294,6 +302,7 @@ public:
>      ::encode(wrlocked_inodes, payload);
>      ::encode(cap_export_bl, payload);
>      ::encode(strong_dirfrags, payload);
> +    ::encode(dirfrag_base, payload);
>      ::encode(weak, payload);
>      ::encode(weak_dirfrags, payload);
>      ::encode(weak_inodes, payload);
> @@ -319,6 +328,7 @@ public:
>        ::decode(cap_export_paths, q);
>      }
>      ::decode(strong_dirfrags, p);
> +    ::decode(dirfrag_base, p);
>      ::decode(weak, p);
>      ::decode(weak_dirfrags, p);
>      ::decode(weak_inodes, p);
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread
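
Greg's suggestion, spelled out: keep the per-dirfrag base blobs in a typed member that the message encodes and decodes as a whole, rather than hand-appending (dirfrag, bufferlist) pairs into an opaque stream that the receiver must re-parse. A rough sketch of that shape with simplified stand-in types (the pair key stands in for the real dirfrag_t):

// dirfrag_base_sketch.cc -- an explicit container instead of an opaque blob.
#include <cstdint>
#include <iostream>
#include <map>
#include <utility>
#include <vector>

using dirfrag_key = std::pair<uint64_t, uint32_t>;  // (ino, frag) stand-in
using blob = std::vector<uint8_t>;

struct CacheRejoinMsg {
  // One self-describing member: encoding and decoding the whole map lives with
  // the message type, so sender and receiver cannot drift apart item by item.
  std::map<dirfrag_key, blob> dirfrag_bases;

  void add_dirfrag_base(uint64_t ino, uint32_t frag, blob base) {
    dirfrag_bases[{ino, frag}] = std::move(base);
  }
};

int main() {
  CacheRejoinMsg m;
  m.add_dirfrag_base(0x100, 0, {1, 2, 3});  // some dirfrag base payload
  for (const auto& p : m.dirfrag_bases)
    std::cout << "dirfrag " << std::hex << p.first.first << "."
              << p.first.second << std::dec
              << " base bytes=" << p.second.size() << "\n";
  return 0;
}

Whether a vector or a map fits best depends on how the receiver looks the entries up; the point is only that a typed container, not a hand-rolled byte stream, carries the structure.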

* Re: [PATCH 20/39] mds: include replica nonce in MMDSCacheRejoin::inode_strong
  2013-03-20 23:26   ` Gregory Farnum
@ 2013-03-20 23:36     ` Sage Weil
  0 siblings, 0 replies; 117+ messages in thread
From: Sage Weil @ 2013-03-20 23:36 UTC (permalink / raw)
  To: Gregory Farnum; +Cc: Yan, Zheng, ceph-devel

On Wed, 20 Mar 2013, Gregory Farnum wrote:
> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> > From: "Yan, Zheng" <zheng.z.yan@intel.com>
> >
> > So the recovering MDS can properly handle cache expire messages.
> > Also increase the nonce value when sending the cache rejoin acks.
> >
> > Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> > ---
> >  src/mds/MDCache.cc             | 35 +++++++++++++++++++++++------------
> >  src/messages/MMDSCacheRejoin.h | 11 +++++++----
> >  2 files changed, 30 insertions(+), 16 deletions(-)
> >
> > diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> > index 008a8a2..8ba676e 100644
> > --- a/src/mds/MDCache.cc
> > +++ b/src/mds/MDCache.cc
> > @@ -3538,6 +3538,7 @@ void MDCache::rejoin_send_rejoins()
> >        if (p->first == 0 && root) {
> >         p->second->add_weak_inode(root->vino());
> >         p->second->add_strong_inode(root->vino(),
> > +                                   root->get_replica_nonce(),
> >                                     root->get_caps_wanted(),
> >                                     root->filelock.get_state(),
> >                                     root->nestlock.get_state(),
> > @@ -3551,6 +3552,7 @@ void MDCache::rejoin_send_rejoins()
> >        if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
> >         p->second->add_weak_inode(in->vino());
> >         p->second->add_strong_inode(in->vino(),
> > +                                   in->get_replica_nonce(),
> >                                     in->get_caps_wanted(),
> >                                     in->filelock.get_state(),
> >                                     in->nestlock.get_state(),
> > @@ -3709,6 +3711,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
> >         CInode *in = dnl->get_inode();
> >         dout(15) << " add_strong_inode " << *in << dendl;
> >         rejoin->add_strong_inode(in->vino(),
> > +                                in->get_replica_nonce(),
> >                                  in->get_caps_wanted(),
> >                                  in->filelock.get_state(),
> >                                  in->nestlock.get_state(),
> > @@ -4248,7 +4251,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
> >         dir = rejoin_invent_dirfrag(p->first);
> >      }
> >      if (dir) {
> > -      dir->add_replica(from);
> > +      dir->add_replica(from, p->second.nonce);
> >        dir->dir_rep = p->second.dir_rep;
> >      } else {
> >        dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
> > @@ -4263,7 +4266,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
> >           dir = rejoin_invent_dirfrag(p->first);
> >         else
> >           dout(10) << " have(approx) " << *dir << dendl;
> > -       dir->add_replica(from);
> > +       dir->add_replica(from, p->second.nonce);
> >         dir->dir_rep = p->second.dir_rep;
> >        }
> >        refragged = true;
> > @@ -4327,7 +4330,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
> >         mdr->locks.insert(&dn->lock);
> >        }
> >
> > -      dn->add_replica(from);
> > +      dn->add_replica(from, q->second.nonce);
> >        dout(10) << " have " << *dn << dendl;
> >
> >        // inode?
> > @@ -4412,7 +4415,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
> >           dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl;
> >         }
> >
> > -       in->add_replica(from);
> > +       in->add_replica(from, p->second.nonce);
> >         dout(10) << " have " << *in << dendl;
> >        }
> >      }
> > @@ -5176,7 +5179,7 @@ void MDCache::rejoin_send_acks()
> >        for (map<int,int>::iterator r = dir->replicas_begin();
> >            r != dir->replicas_end();
> >            ++r)
> > -       ack[r->first]->add_strong_dirfrag(dir->dirfrag(), r->second, dir->dir_rep);
> > +       ack[r->first]->add_strong_dirfrag(dir->dirfrag(), ++r->second, dir->dir_rep);
> >
> >        for (CDir::map_t::iterator q = dir->items.begin();
> >            q != dir->items.end();
> > @@ -5192,7 +5195,7 @@ void MDCache::rejoin_send_acks()
> >                                            dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
> >                                            dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
> >                                            dnl->is_remote() ? dnl->get_remote_d_type():0,
> > -                                          r->second,
> > +                                          ++r->second,
> >                                            dn->lock.get_replica_state());
> >
> >         if (!dnl->is_primary())
> > @@ -5205,7 +5208,7 @@ void MDCache::rejoin_send_acks()
> >              r != in->replicas_end();
> >              ++r) {
> >           ack[r->first]->add_inode_base(in);
> > -         ack[r->first]->add_inode_locks(in, r->second);
> > +         ack[r->first]->add_inode_locks(in, ++r->second);
> >         }
> >
> >         // subdirs in this subtree?
> > @@ -5220,14 +5223,14 @@ void MDCache::rejoin_send_acks()
> >          r != root->replicas_end();
> >          ++r) {
> >        ack[r->first]->add_inode_base(root);
> > -      ack[r->first]->add_inode_locks(root, r->second);
> > +      ack[r->first]->add_inode_locks(root, ++r->second);
> >      }
> >    if (myin)
> >      for (map<int,int>::iterator r = myin->replicas_begin();
> >          r != myin->replicas_end();
> >          ++r) {
> >        ack[r->first]->add_inode_base(myin);
> > -      ack[r->first]->add_inode_locks(myin, r->second);
> > +      ack[r->first]->add_inode_locks(myin, ++r->second);
> >      }
> >
> >    // include inode base for any inodes whose scatterlocks may have updated
> > @@ -5728,6 +5731,12 @@ void MDCache::send_expire_messages(map<int, MCacheExpire*>& expiremap)
> >    for (map<int, MCacheExpire*>::iterator it = expiremap.begin();
> >         it != expiremap.end();
> >         ++it) {
> > +    if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
> > +       (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
> > +        rejoin_sent.count(it->first) == 0)) {
> > +      it->second->put();
> > +      continue;
> > +    }
> >      dout(7) << "sending cache_expire to " << it->first << dendl;
> >      mds->send_message_mds(it->second, it->first);
> >    }
> > @@ -9640,9 +9649,11 @@ void MDCache::handle_dentry_link(MDentryLink *m)
> >      CInode *in = add_replica_inode(p, NULL, finished);
> >      assert(in->get_num_ref() == 0);
> >      assert(in->get_parent_dn() == NULL);
> > -    MCacheExpire* expire = new MCacheExpire(mds->get_nodeid());
> > -    expire->add_inode(m->get_subtree(), in->vino(), in->get_replica_nonce());
> > -    mds->send_message_mds(expire, m->get_source().num());
> > +    map<int, MCacheExpire*> expiremap;
> > +    int from = m->get_source().num();
> > +    expiremap[from] = new MCacheExpire(mds->get_nodeid());
> > +    expiremap[from]->add_inode(m->get_subtree(), in->vino(), in->get_replica_nonce());
> > +    send_expire_messages(expiremap);
> >      remove_inode(in);
> >    }
> >
> > diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h
> > index 825400d..b88f551 100644
> > --- a/src/messages/MMDSCacheRejoin.h
> > +++ b/src/messages/MMDSCacheRejoin.h
> > @@ -43,19 +43,22 @@ class MMDSCacheRejoin : public Message {
> >
> >    // -- types --
> >    struct inode_strong {
> > +    int32_t nonce;
> >      int32_t caps_wanted;
> >      int32_t filelock, nestlock, dftlock;
> >      inode_strong() {}
> > -    inode_strong(int cw, int dl, int nl, int dftl) :
> > -      caps_wanted(cw),
> > +    inode_strong(int n, int cw, int dl, int nl, int dftl) :
> > +      nonce(n), caps_wanted(cw),
> >        filelock(dl), nestlock(nl), dftlock(dftl) { }
> >      void encode(bufferlist &bl) const {
> > +      ::encode(nonce, bl);
> >        ::encode(caps_wanted, bl);
> >        ::encode(filelock, bl);
> >        ::encode(nestlock, bl);
> >        ::encode(dftlock, bl);
> >      }
> >      void decode(bufferlist::iterator &bl) {
> > +      ::decode(nonce, bl);
> >        ::decode(caps_wanted, bl);
> >        ::decode(filelock, bl);
> >        ::decode(nestlock, bl);
> 
> This is a wire format change without any versioning to cover it?
> We're going to need to add feature bits to cover this, at a minimum. It
> might be more appropriate to introduce proper versioning at the same
> time, though. You should find examples of everything you need in my
> recent encoding changes.

Since we're not too concerned about rolling upgrades for the mds cluster, 
we could just bump the CEPH_MDSC_PROTOCOL instead of spending another 
feature bit (we're halfway through them!).

...but we should also move to the new encoding macros opportunistically :)
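
Something like this for inode_strong, say (rough, untested sketch; it assumes
the ENCODE_START/ENCODE_FINISH and DECODE_START/DECODE_FINISH helpers, and that
the protocol bump retires the old unversioned layout):

  void encode(bufferlist &bl) const {
    ENCODE_START(1, 1, bl);   // versioned from the start; new fields bump this
    ::encode(nonce, bl);
    ::encode(caps_wanted, bl);
    ::encode(filelock, bl);
    ::encode(nestlock, bl);
    ::encode(dftlock, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::iterator &bl) {
    DECODE_START(1, bl);
    ::decode(nonce, bl);
    ::decode(caps_wanted, bl);
    ::decode(filelock, bl);
    ::decode(nestlock, bl);
    ::decode(dftlock, bl);
    DECODE_FINISH(bl);
  }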

s

> 
> The rest looks good.
> -Greg
> 
> > @@ -208,8 +211,8 @@ public:
> >    void add_weak_inode(vinodeno_t i) {
> >      weak_inodes.insert(i);
> >    }
> > -  void add_strong_inode(vinodeno_t i, int cw, int dl, int nl, int dftl) {
> > -    strong_inodes[i] = inode_strong(cw, dl, nl, dftl);
> > +  void add_strong_inode(vinodeno_t i, int n, int cw, int dl, int nl, int dftl) {
> > +    strong_inodes[i] = inode_strong(n, cw, dl, nl, dftl);
> >    }
> >    void add_inode_locks(CInode *in, __u32 nonce) {
> >      ::encode(in->inode.ino, inode_locks);
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 21/39] mds: encode dirfrag base in cache rejoin ack
  2013-03-20 23:33   ` Gregory Farnum
@ 2013-03-20 23:40     ` Gregory Farnum
  2013-03-21  6:41     ` Yan, Zheng
  1 sibling, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-20 23:40 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

On Wed, Mar 20, 2013 at 4:33 PM, Gregory Farnum <greg@inktank.com> wrote:
> This needs to handle versioning the encoding based on peer feature bits too.
>
> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
>> +  void add_dirfrag_base(CDir *dir) {
>> +    ::encode(dir->dirfrag(), dirfrag_base);
>> +    bufferlist bl;
>> +    dir->_encode_base(bl);
>> +    ::encode(bl, dirfrag_base);
>> +  }
>
> We are guilty of doing this in other places, but we should avoid
> implicit encodings like this one, especially when the decode happens
> somewhere else like it does here. We can make a vector dirfrag_bases
> and add to that, and then encode and decode it along with the rest of
> the message — would that work for your purposes?
> -Greg

Sorry, a vector (called dirfrag_bases) of pair<dirfrag_t, bl> where bl
is the encoded base. Or something like that. :)
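
Roughly what I have in mind (just a sketch; the member and helper shapes here
are placeholders, not a worked-out implementation):

  // in MMDSCacheRejoin
  vector<pair<dirfrag_t, bufferlist> > dirfrag_bases;

  void add_dirfrag_base(CDir *dir) {
    bufferlist bl;
    dir->_encode_base(bl);
    dirfrag_bases.push_back(make_pair(dir->dirfrag(), bl));
  }

The message's encode_payload()/decode_payload() would then ::encode/::decode
dirfrag_bases along with the other members, so the layout is visible in the
message definition instead of being implicit in a raw bufferlist.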
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 23/39] mds: reqid for rejoinning authpin/wrlock need to be list
  2013-03-17 14:51 ` [PATCH 23/39] mds: reqid for rejoinning authpin/wrlock need to be list Yan, Zheng
@ 2013-03-20 23:59   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-20 23:59 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

I think Sage is right, we can just bump the MDS protocol instead of
spending a feature bit on OTW changes — but this is another message we
should update to the new encoding macros while we're making that bump.
The rest looks good!
-Greg

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc             | 78 ++++++++++++++++++++++++------------------
>  src/messages/MMDSCacheRejoin.h | 12 +++----
>  2 files changed, 50 insertions(+), 40 deletions(-)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 38b1fdf..f4622de 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -4305,16 +4305,19 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>        // dn auth_pin?
>        if (strong->authpinned_dentries.count(p->first) &&
>           strong->authpinned_dentries[p->first].count(q->first)) {
> -       MMDSCacheRejoin::slave_reqid r = strong->authpinned_dentries[p->first][q->first];
> -       dout(10) << " dn authpin by " << r << " on " << *dn << dendl;
> -
> -       // get/create slave mdrequest
> -       MDRequest *mdr;
> -       if (have_request(r.reqid))
> -         mdr = request_get(r.reqid);
> -       else
> -         mdr = request_start_slave(r.reqid, r.attempt, from);
> -       mdr->auth_pin(dn);
> +       for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_dentries[p->first][q->first].begin();
> +            r != strong->authpinned_dentries[p->first][q->first].end();
> +            ++r) {
> +         dout(10) << " dn authpin by " << *r << " on " << *dn << dendl;
> +
> +         // get/create slave mdrequest
> +         MDRequest *mdr;
> +         if (have_request(r->reqid))
> +           mdr = request_get(r->reqid);
> +         else
> +           mdr = request_start_slave(r->reqid, r->attempt, from);
> +         mdr->auth_pin(dn);
> +       }
>        }
>
>        // dn xlock?
> @@ -4389,22 +4392,25 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>
>      // auth pin?
>      if (strong->authpinned_inodes.count(in->vino())) {
> -      MMDSCacheRejoin::slave_reqid r = strong->authpinned_inodes[in->vino()];
> -      dout(10) << " inode authpin by " << r << " on " << *in << dendl;
> +      for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_inodes[in->vino()].begin();
> +          r != strong->authpinned_inodes[in->vino()].end();
> +          ++r) {
> +       dout(10) << " inode authpin by " << *r << " on " << *in << dendl;
>
> -      // get/create slave mdrequest
> -      MDRequest *mdr;
> -      if (have_request(r.reqid))
> -       mdr = request_get(r.reqid);
> -      else
> -       mdr = request_start_slave(r.reqid, r.attempt, from);
> -      if (strong->frozen_authpin_inodes.count(in->vino())) {
> -       assert(!in->get_num_auth_pins());
> -       mdr->freeze_auth_pin(in);
> -      } else {
> -       assert(!in->is_frozen_auth_pin());
> +       // get/create slave mdrequest
> +       MDRequest *mdr;
> +       if (have_request(r->reqid))
> +         mdr = request_get(r->reqid);
> +       else
> +         mdr = request_start_slave(r->reqid, r->attempt, from);
> +       if (strong->frozen_authpin_inodes.count(in->vino())) {
> +         assert(!in->get_num_auth_pins());
> +         mdr->freeze_auth_pin(in);
> +       } else {
> +         assert(!in->is_frozen_auth_pin());
> +       }
> +       mdr->auth_pin(in);
>        }
> -      mdr->auth_pin(in);
>      }
>      // xlock(s)?
>      if (strong->xlocked_inodes.count(in->vino())) {
> @@ -4427,19 +4433,23 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>      }
>      // wrlock(s)?
>      if (strong->wrlocked_inodes.count(in->vino())) {
> -      for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->wrlocked_inodes[in->vino()].begin();
> +      for (map<int, list<MMDSCacheRejoin::slave_reqid> >::iterator q = strong->wrlocked_inodes[in->vino()].begin();
>            q != strong->wrlocked_inodes[in->vino()].end();
>            ++q) {
>         SimpleLock *lock = in->get_lock(q->first);
> -       dout(10) << " inode wrlock by " << q->second << " on " << *lock << " on " << *in << dendl;
> -       MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
> -       assert(mdr->is_auth_pinned(in));
> -       lock->set_state(LOCK_LOCK);
> -       if (lock == &in->filelock)
> -         in->loner_cap = -1;
> -       lock->get_wrlock(true);
> -       mdr->wrlocks.insert(lock);
> -       mdr->locks.insert(lock);
> +       for (list<MMDSCacheRejoin::slave_reqid>::iterator r = q->second.begin();
> +            r != q->second.end();
> +            ++r) {
> +         dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl;
> +         MDRequest *mdr = request_get(r->reqid);  // should have this from auth_pin above.
> +         assert(mdr->is_auth_pinned(in));
> +         lock->set_state(LOCK_MIX);
> +         if (lock == &in->filelock)
> +           in->loner_cap = -1;
> +         lock->get_wrlock(true);
> +         mdr->wrlocks.insert(lock);
> +         mdr->locks.insert(lock);
> +       }
>        }
>      }
>    }
> diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h
> index 7c37ab4..f29e8f5 100644
> --- a/src/messages/MMDSCacheRejoin.h
> +++ b/src/messages/MMDSCacheRejoin.h
> @@ -188,11 +188,11 @@ class MMDSCacheRejoin : public Message {
>        ::decode(attempt, bl);
>      }
>    };
> -  map<vinodeno_t, slave_reqid> authpinned_inodes;
> +  map<vinodeno_t, list<slave_reqid> > authpinned_inodes;
>    map<vinodeno_t, slave_reqid> frozen_authpin_inodes;
>    map<vinodeno_t, map<__s32, slave_reqid> > xlocked_inodes;
> -  map<vinodeno_t, map<__s32, slave_reqid> > wrlocked_inodes;
> -  map<dirfrag_t, map<string_snap_t, slave_reqid> > authpinned_dentries;
> +  map<vinodeno_t, map<__s32, list<slave_reqid> > > wrlocked_inodes;
> +  map<dirfrag_t, map<string_snap_t, list<slave_reqid> > > authpinned_dentries;
>    map<dirfrag_t, map<string_snap_t, slave_reqid> > xlocked_dentries;
>
>    MMDSCacheRejoin() : Message(MSG_MDS_CACHEREJOIN) {}
> @@ -232,7 +232,7 @@ public:
>      ::encode(bl, inode_base);
>    }
>    void add_inode_authpin(vinodeno_t ino, const metareqid_t& ri, __u32 attempt) {
> -    authpinned_inodes[ino] = slave_reqid(ri, attempt);
> +    authpinned_inodes[ino].push_back(slave_reqid(ri, attempt));
>    }
>    void add_inode_frozen_authpin(vinodeno_t ino, const metareqid_t& ri, __u32 attempt) {
>      frozen_authpin_inodes[ino] = slave_reqid(ri, attempt);
> @@ -241,7 +241,7 @@ public:
>      xlocked_inodes[ino][lt] = slave_reqid(ri, attempt);
>    }
>    void add_inode_wrlock(vinodeno_t ino, int lt, const metareqid_t& ri, __u32 attempt) {
> -    wrlocked_inodes[ino][lt] = slave_reqid(ri, attempt);
> +    wrlocked_inodes[ino][lt].push_back(slave_reqid(ri, attempt));
>    }
>
>    void add_scatterlock_state(CInode *in) {
> @@ -282,7 +282,7 @@ public:
>    }
>    void add_dentry_authpin(dirfrag_t df, const string& dname, snapid_t last,
>                           const metareqid_t& ri, __u32 attempt) {
> -    authpinned_dentries[df][string_snap_t(dname, last)] = slave_reqid(ri, attempt);
> +    authpinned_dentries[df][string_snap_t(dname, last)].push_back(slave_reqid(ri, attempt));
>    }
>    void add_dentry_xlock(dirfrag_t df, const string& dname, snapid_t last,
>                         const metareqid_t& ri, __u32 attempt) {
> --
> 1.7.11.7
>
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 24/39] mds: take object's versionlock when rejoinning xlock
  2013-03-17 14:51 ` [PATCH 24/39] mds: take object's versionlock when rejoinning xlock Yan, Zheng
@ 2013-03-21  0:37   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21  0:37 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc | 12 ++++++++++++
>  1 file changed, 12 insertions(+)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index f4622de..194f983 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -4327,6 +4327,12 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>         dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
>         MDRequest *mdr = request_get(r.reqid);  // should have this from auth_pin above.
>         assert(mdr->is_auth_pinned(dn));
> +       if (!mdr->xlocks.count(&dn->versionlock)) {
> +         assert(dn->versionlock.can_xlock_local());
> +         dn->versionlock.get_xlock(mdr, mdr->get_client());
> +         mdr->xlocks.insert(&dn->versionlock);
> +         mdr->locks.insert(&dn->versionlock);
> +       }
>         if (dn->lock.is_stable())
>           dn->auth_pin(&dn->lock);
>         dn->lock.set_state(LOCK_XLOCK);
> @@ -4421,6 +4427,12 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>         dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
>         MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
>         assert(mdr->is_auth_pinned(in));
> +       if (!mdr->xlocks.count(&in->versionlock)) {
> +         assert(in->versionlock.can_xlock_local());
> +         in->versionlock.get_xlock(mdr, mdr->get_client());
> +         mdr->xlocks.insert(&in->versionlock);
> +         mdr->locks.insert(&in->versionlock);
> +       }
>         if (lock->is_stable())
>           in->auth_pin(lock);
>         lock->set_state(LOCK_XLOCK);
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 25/39] mds: share inode max size after MDS recovers
  2013-03-17 14:51 ` [PATCH 25/39] mds: share inode max size after MDS recovers Yan, Zheng
@ 2013-03-21  0:45   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21  0:45 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> The MDS may crash after journaling the new max size, but before sending
> the new max size to the client. Later, when the MDS recovers, the client
> re-requests the new max size, but the MDS finds the max size unchanged. So
> the client waits for the new max size forever. This issue can be avoided by
> checking the client cap's last_sent and sharing the inode max size if it is
> zero (a reconnected cap's last_sent is zero).
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/Locker.cc  | 18 ++++++++++++++----
>  src/mds/Locker.h   |  2 +-
>  src/mds/MDCache.cc |  2 ++
>  3 files changed, 17 insertions(+), 5 deletions(-)
>
> diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
> index 0055a19..4d45f99 100644
> --- a/src/mds/Locker.cc
> +++ b/src/mds/Locker.cc
> @@ -2089,7 +2089,7 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock,
>  }
>
>
> -void Locker::share_inode_max_size(CInode *in)
> +void Locker::share_inode_max_size(CInode *in, Capability *only_cap)
>  {
>    /*
>     * only share if currently issued a WR cap.  if client doesn't have it,
> @@ -2097,9 +2097,12 @@ void Locker::share_inode_max_size(CInode *in)
>     * the cap later.
>     */
>    dout(10) << "share_inode_max_size on " << *in << dendl;
> -  for (map<client_t,Capability*>::iterator it = in->client_caps.begin();
> -       it != in->client_caps.end();
> -       ++it) {
> +  map<client_t, Capability*>::iterator it;
> +  if (only_cap)
> +    it = in->client_caps.find(only_cap->get_client());
> +  else
> +    it = in->client_caps.begin();
> +  for (; it != in->client_caps.end(); ++it) {
>      const client_t client = it->first;
>      Capability *cap = it->second;
>      if (cap->is_suppress())
> @@ -2115,6 +2118,8 @@ void Locker::share_inode_max_size(CInode *in)
>        in->encode_cap_message(m, cap);
>        mds->send_message_client_counted(m, client);
>      }
> +    if (only_cap)
> +      break;
>    }
>  }
>
> @@ -2398,6 +2403,11 @@ void Locker::handle_client_caps(MClientCaps *m)
>        bool did_issue = eval(in, CEPH_CAP_LOCKS);
>        if (!did_issue && (cap->wanted() & ~cap->pending()))
>         issue_caps(in, cap);
> +      if (cap->get_last_seq() == 0 &&
> +         (cap->pending() & (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER))) {
> +       cap->issue_norevoke(cap->issued());
> +       share_inode_max_size(in, cap);
> +      }
>      }
>    }
>
> diff --git a/src/mds/Locker.h b/src/mds/Locker.h
> index 3f79996..d98104f 100644
> --- a/src/mds/Locker.h
> +++ b/src/mds/Locker.h
> @@ -276,7 +276,7 @@ public:
>    void calc_new_client_ranges(CInode *in, uint64_t size, map<client_t, client_writeable_range_t>& new_ranges);
>    bool check_inode_max_size(CInode *in, bool force_wrlock=false, bool update_size=false, uint64_t newsize=0,
>                             utime_t mtime=utime_t());
> -  void share_inode_max_size(CInode *in);
> +  void share_inode_max_size(CInode *in, Capability *only_cap=0);
>
>  private:
>    friend class C_MDL_CheckMaxSize;
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 194f983..459b400 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -5073,6 +5073,8 @@ void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap)
>    SnapRealm *realm = in->find_snaprealm();
>    if (realm->have_past_parents_open()) {
>      dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
> +    if (cap->get_last_seq() == 0)
> +      cap->issue_norevoke(cap->issued()); // reconnected cap
>      cap->set_last_issue();
>      MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT,
>                                         in->ino(),
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 26/39] mds: issue caps when lock state in replica become SYNC
  2013-03-17 14:51 ` [PATCH 26/39] mds: issue caps when lock state in replica become SYNC Yan, Zheng
@ 2013-03-21  0:52   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21  0:52 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> because the client can request READ caps from a non-auth MDS.
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/Locker.cc | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
> index 4d45f99..28920d4 100644
> --- a/src/mds/Locker.cc
> +++ b/src/mds/Locker.cc
> @@ -4403,6 +4403,8 @@ void Locker::handle_file_lock(ScatterLock *lock, MLock *m)
>      lock->set_state(LOCK_SYNC);
>
>      lock->get_rdlock();
> +    if (caps)
> +      issue_caps(in);
>      lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
>      lock->put_rdlock();
>      break;
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 08/39] mds: consider MDS as recovered when it reaches clientreply state.
  2013-03-20 18:40   ` Greg Farnum
@ 2013-03-21  2:22     ` Yan, Zheng
  2013-03-21 21:43       ` Gregory Farnum
  0 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-21  2:22 UTC (permalink / raw)
  To: Greg Farnum; +Cc: ceph-devel, sage

On 03/21/2013 02:40 AM, Greg Farnum wrote:
> The idea of this patch makes sense, but I'm not sure we guarantee that each daemon sees every map update — if we don't, then an MDS that misses the map moving a peer into CLIENTREPLAY won't process that peer as having recovered on the next map. Sage or Joao, what guarantees does subscription provide?
> -Greg

See MDS::active_start(); it also kicks clientreplay waiters. And I will fix the 'clientreply' typo in my git tree.

Thanks
Yan, Zheng

> 
> Software Engineer #42 @ http://inktank.com | http://ceph.com
> 
> 
> On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:
> 
>> From: "Yan, Zheng" <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)>
>>  
>> An MDS in clientreply state has already started serving requests. It also
>> makes MDS::handle_mds_recovery() and MDS::recovery_done() match.
>>  
>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>> ---
>> src/mds/MDS.cc | 2 ++
>> 1 file changed, 2 insertions(+)
>>  
>> diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
>> index 282fa64..b91dcbd 100644
>> --- a/src/mds/MDS.cc
>> +++ b/src/mds/MDS.cc
>> @@ -1032,7 +1032,9 @@ void MDS::handle_mds_map(MMDSMap *m)
>>  
>> set<int> oldactive, active;
>> oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE);
>> + oldmap->get_mds_set(oldactive, MDSMap::STATE_CLIENTREPLAY);
>> mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE);
>> + mdsmap->get_mds_set(active, MDSMap::STATE_CLIENTREPLAY);
>> for (set<int>::iterator p = active.begin(); p != active.end(); ++p)  
>> if (*p != whoami && // not me
>> oldactive.count(*p) == 0) // newly so?
>> --  
>> 1.7.11.7
> 
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 09/39] mds: defer eval gather locks when removing replica
  2013-03-20 19:36   ` Greg Farnum
@ 2013-03-21  2:29     ` Yan, Zheng
  0 siblings, 0 replies; 117+ messages in thread
From: Yan, Zheng @ 2013-03-21  2:29 UTC (permalink / raw)
  To: Greg Farnum; +Cc: ceph-devel, sage

Will update my git tree.

Thanks
Yan, Zheng

On 03/21/2013 03:36 AM, Greg Farnum wrote:
> On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:
>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>
>> Locks' states should not change between composing the cache rejoin ack
>> messages and sending the message. If Locker::eval_gather() is called
>> in MDCache::{inode,dentry}_remove_replica(), it may wake requests and
>> change locks' states.
>>
>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>> ---
>> src/mds/MDCache.cc | 51 ++++++++++++++++++++++++++++++---------------------
>> src/mds/MDCache.h | 8 +++++---
>> 2 files changed, 35 insertions(+), 24 deletions(-)
>>
>> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>> index 19dc60b..0f6b842 100644
>> --- a/src/mds/MDCache.cc
>> +++ b/src/mds/MDCache.cc
>> @@ -3729,6 +3729,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
>> // possible response(s)
>> MMDSCacheRejoin *ack = 0; // if survivor
>> set<vinodeno_t> acked_inodes; // if survivor
>> + set<SimpleLock *> gather_locks; // if survivor
>> bool survivor = false; // am i a survivor?
>>
>> if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
>> @@ -3851,7 +3852,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
>> assert(dnl->is_primary());
>>
>> if (survivor && dn->is_replica(from)) 
>> - dentry_remove_replica(dn, from); // this induces a lock gather completion
>> + dentry_remove_replica(dn, from, gather_locks); // this induces a lock gather completion
> 
> This comment is no longer accurate :) 
>> int dnonce = dn->add_replica(from);
>> dout(10) << " have " << *dn << dendl;
>> if (ack) 
>> @@ -3864,7 +3865,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
>> assert(in);
>>
>> if (survivor && in->is_replica(from)) 
>> - inode_remove_replica(in, from);
>> + inode_remove_replica(in, from, gather_locks);
>> int inonce = in->add_replica(from);
>> dout(10) << " have " << *in << dendl;
>>
>> @@ -3887,7 +3888,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
>> CInode *in = get_inode(*p);
>> assert(in); // hmm fixme wrt stray?
>> if (survivor && in->is_replica(from)) 
>> - inode_remove_replica(in, from); // this induces a lock gather completion
>> + inode_remove_replica(in, from, gather_locks); // this induces a lock gather completion
> 
> Same here. 
> 
> Other than those, looks good.
> -Greg
> Software Engineer #42 @ http://inktank.com | http://ceph.com
> 
> 
>> int inonce = in->add_replica(from);
>> dout(10) << " have base " << *in << dendl;
>>
>> @@ -3909,8 +3910,11 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
>> ack->add_inode_base(in);
>> }
>>
>> - rejoin_scour_survivor_replicas(from, ack, acked_inodes);
>> + rejoin_scour_survivor_replicas(from, ack, gather_locks, acked_inodes);
>> mds->send_message(ack, weak->get_connection());
>> +
>> + for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p)
>> + mds->locker->eval_gather(*p);
>> } else {
>> // done?
>> assert(rejoin_gather.count(from));
>> @@ -4055,7 +4059,9 @@ bool MDCache::parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
>> * all validated replicas are acked with a strong nonce, etc. if that isn't in the
>> * ack, the replica dne, and we can remove it from our replica maps.
>> */
>> -void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set<vinodeno_t>& acked_inodes)
>> +void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack,
>> + set<SimpleLock *>& gather_locks,
>> + set<vinodeno_t>& acked_inodes)
>> {
>> dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
>>
>> @@ -4070,7 +4076,7 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set
>> if (in->is_auth() &&
>> in->is_replica(from) &&
>> acked_inodes.count(p->second->vino()) == 0) {
>> - inode_remove_replica(in, from);
>> + inode_remove_replica(in, from, gather_locks);
>> dout(10) << " rem " << *in << dendl;
>> }
>>
>> @@ -4099,7 +4105,7 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set
>> if (dn->is_replica(from) &&
>> (ack->strong_dentries.count(dir->dirfrag()) == 0 ||
>> ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->name, dn->last)) == 0)) {
>> - dentry_remove_replica(dn, from);
>> + dentry_remove_replica(dn, from, gather_locks);
>> dout(10) << " rem " << *dn << dendl;
>> }
>> }
>> @@ -6189,6 +6195,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
>> return;
>> }
>>
>> + set<SimpleLock *> gather_locks;
>> // loop over realms
>> for (map<dirfrag_t,MCacheExpire::realm>::iterator p = m->realms.begin();
>> p != m->realms.end();
>> @@ -6255,7 +6262,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
>> // remove from our cached_by
>> dout(7) << " inode expire on " << *in << " from mds." << from 
>> << " cached_by was " << in->get_replicas() << dendl;
>> - inode_remove_replica(in, from);
>> + inode_remove_replica(in, from, gather_locks);
>> } 
>> else {
>> // this is an old nonce, ignore expire.
>> @@ -6332,7 +6339,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
>>
>> if (nonce == dn->get_replica_nonce(from)) {
>> dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
>> - dentry_remove_replica(dn, from);
>> + dentry_remove_replica(dn, from, gather_locks);
>> } 
>> else {
>> dout(7) << " dentry_expire on " << *dn << " from mds." << from
>> @@ -6343,6 +6350,8 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
>> }
>> }
>>
>> + for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p)
>> + mds->locker->eval_gather(*p);
>>
>> // done
>> m->put();
>> @@ -6368,35 +6377,35 @@ void MDCache::discard_delayed_expire(CDir *dir)
>> delayed_expire.erase(dir); 
>> }
>>
>> -void MDCache::inode_remove_replica(CInode *in, int from)
>> +void MDCache::inode_remove_replica(CInode *in, int from, set<SimpleLock *>& gather_locks)
>> {
>> in->remove_replica(from);
>> in->mds_caps_wanted.erase(from);
>>
>> // note: this code calls _eval more often than it needs to!
>> // fix lock
>> - if (in->authlock.remove_replica(from)) mds->locker->eval_gather(&in->authlock);
>> - if (in->linklock.remove_replica(from)) mds->locker->eval_gather(&in->linklock);
>> - if (in->dirfragtreelock.remove_replica(from)) mds->locker->eval_gather(&in->dirfragtreelock);
>> - if (in->filelock.remove_replica(from)) mds->locker->eval_gather(&in->filelock);
>> - if (in->snaplock.remove_replica(from)) mds->locker->eval_gather(&in->snaplock);
>> - if (in->xattrlock.remove_replica(from)) mds->locker->eval_gather(&in->xattrlock);
>> + if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
>> + if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
>> + if (in->dirfragtreelock.remove_replica(from)) gather_locks.insert(&in->dirfragtreelock);
>> + if (in->filelock.remove_replica(from)) gather_locks.insert(&in->filelock);
>> + if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
>> + if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
>>
>> - if (in->nestlock.remove_replica(from)) mds->locker->eval_gather(&in->nestlock);
>> - if (in->flocklock.remove_replica(from)) mds->locker->eval_gather(&in->flocklock);
>> - if (in->policylock.remove_replica(from)) mds->locker->eval_gather(&in->policylock);
>> + if (in->nestlock.remove_replica(from)) gather_locks.insert(&in->nestlock);
>> + if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
>> + if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
>>
>> // trim?
>> maybe_eval_stray(in);
>> }
>>
>> -void MDCache::dentry_remove_replica(CDentry *dn, int from)
>> +void MDCache::dentry_remove_replica(CDentry *dn, int from, set<SimpleLock *>& gather_locks)
>> {
>> dn->remove_replica(from);
>>
>> // fix lock
>> if (dn->lock.remove_replica(from))
>> - mds->locker->eval_gather(&dn->lock);
>> + gather_locks.insert(&dn->lock);
>>
>> CDentry::linkage_t *dnl = dn->get_projected_linkage();
>> if (dnl->is_primary())
>> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
>> index f07ea74..a9f05c6 100644
>> --- a/src/mds/MDCache.h
>> +++ b/src/mds/MDCache.h
>> @@ -406,7 +406,9 @@ protected:
>> CDir* rejoin_invent_dirfrag(dirfrag_t df);
>> bool rejoin_fetch_dirfrags(MMDSCacheRejoin *m);
>> void handle_cache_rejoin_strong(MMDSCacheRejoin *m);
>> - void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, set<vinodeno_t>& acked_inodes);
>> + void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack,
>> + set<SimpleLock *>& gather_locks,
>> + set<vinodeno_t>& acked_inodes);
>> void handle_cache_rejoin_ack(MMDSCacheRejoin *m);
>> void handle_cache_rejoin_purge(MMDSCacheRejoin *m);
>> void handle_cache_rejoin_missing(MMDSCacheRejoin *m);
>> @@ -607,8 +609,8 @@ public:
>> }
>> protected:
>>
>> - void inode_remove_replica(CInode *in, int rep);
>> - void dentry_remove_replica(CDentry *dn, int rep);
>> + void inode_remove_replica(CInode *in, int rep, set<SimpleLock *>& gather_locks);
>> + void dentry_remove_replica(CDentry *dn, int rep, set<SimpleLock *>& gather_locks);
>>
>> void rename_file(CDentry *srcdn, CDentry *destdn);
>>
>> -- 
>> 1.7.11.7
> 
> 


^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 11/39] mds: don't delay processing replica buffer in slave request
  2013-03-20 21:19   ` Greg Farnum
@ 2013-03-21  2:38     ` Yan, Zheng
  2013-03-21  4:15       ` Sage Weil
  0 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-21  2:38 UTC (permalink / raw)
  To: Greg Farnum; +Cc: ceph-devel, sage

On 03/21/2013 05:19 AM, Greg Farnum wrote:
> On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:
>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>
>> Replicated objects need to be added into the cache immediately
>>
>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> Why do we need to add them right away? Shouldn't we have a journaled replica if we need it?
> -Greg

The issue I encountered is that a lock action message was received, but the replicated objects weren't in
the cache because the slave request was delayed.

Thanks
Yan, Zheng


> 
> Software Engineer #42 @ http://inktank.com | http://ceph.com
>> ---
>> src/mds/MDCache.cc | 12 ++++++++++++
>> src/mds/MDCache.h | 2 +-
>> src/mds/MDS.cc | 6 +++---
>> src/mds/Server.cc | 55 +++++++++++++++++++++++++++++++++++++++---------------
>> 4 files changed, 56 insertions(+), 19 deletions(-)
>>
>> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>> index 0f6b842..b668842 100644
>> --- a/src/mds/MDCache.cc
>> +++ b/src/mds/MDCache.cc
>> @@ -7722,6 +7722,18 @@ void MDCache::_find_ino_dir(inodeno_t ino, Context *fin, bufferlist& bl, int r)
>>
>> /* ---------------------------- */
>>
>> +int MDCache::get_num_client_requests()
>> +{
>> + int count = 0;
>> + for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
>> + p != active_requests.end();
>> + ++p) {
>> + if (p->second->reqid.name.is_client() && !p->second->is_slave())
>> + count++;
>> + }
>> + return count;
>> +}
>> +
>> /* This function takes over the reference to the passed Message */
>> MDRequest *MDCache::request_start(MClientRequest *req)
>> {
>> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
>> index a9f05c6..4634121 100644
>> --- a/src/mds/MDCache.h
>> +++ b/src/mds/MDCache.h
>> @@ -240,7 +240,7 @@ protected:
>> hash_map<metareqid_t, MDRequest*> active_requests; 
>>
>> public:
>> - int get_num_active_requests() { return active_requests.size(); }
>> + int get_num_client_requests();
>>
>> MDRequest* request_start(MClientRequest *req);
>> MDRequest* request_start_slave(metareqid_t rid, __u32 attempt, int by);
>> diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
>> index b91dcbd..e99eecc 100644
>> --- a/src/mds/MDS.cc
>> +++ b/src/mds/MDS.cc
>> @@ -1900,9 +1900,9 @@ bool MDS::_dispatch(Message *m)
>> mdcache->is_open() &&
>> replay_queue.empty() &&
>> want_state == MDSMap::STATE_CLIENTREPLAY) {
>> - dout(10) << " still have " << mdcache->get_num_active_requests()
>> - << " active replay requests" << dendl;
>> - if (mdcache->get_num_active_requests() == 0)
>> + int num_requests = mdcache->get_num_client_requests();
>> + dout(10) << " still have " << num_requests << " active replay requests" << dendl;
>> + if (num_requests == 0)
>> clientreplay_done();
>> }
>>
>> diff --git a/src/mds/Server.cc b/src/mds/Server.cc
>> index 4c4c86b..8e89e4c 100644
>> --- a/src/mds/Server.cc
>> +++ b/src/mds/Server.cc
>> @@ -107,10 +107,8 @@ void Server::dispatch(Message *m)
>> (m->get_type() == CEPH_MSG_CLIENT_REQUEST &&
>> (static_cast<MClientRequest*>(m))->is_replay()))) {
>> // replaying!
>> - } else if (mds->is_clientreplay() && m->get_type() == MSG_MDS_SLAVE_REQUEST &&
>> - ((static_cast<MMDSSlaveRequest*>(m))->is_reply() ||
>> - !mds->mdsmap->is_active(m->get_source().num()))) {
>> - // slave reply or the master is also in the clientreplay stage
>> + } else if (m->get_type() == MSG_MDS_SLAVE_REQUEST) {
>> + // handle_slave_request() will wait if necessary
>> } else {
>> dout(3) << "not active yet, waiting" << dendl;
>> mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
>> @@ -1291,6 +1289,13 @@ void Server::handle_slave_request(MMDSSlaveRequest *m)
>> if (m->is_reply())
>> return handle_slave_request_reply(m);
>>
>> + CDentry *straydn = NULL;
>> + if (m->stray.length() > 0) {
>> + straydn = mdcache->add_replica_stray(m->stray, from);
>> + assert(straydn);
>> + m->stray.clear();
>> + }
>> +
>> // am i a new slave?
>> MDRequest *mdr = NULL;
>> if (mdcache->have_request(m->get_reqid())) {
>> @@ -1326,9 +1331,26 @@ void Server::handle_slave_request(MMDSSlaveRequest *m)
>> m->put();
>> return;
>> }
>> - mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m->get_source().num());
>> + mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), from);
>> }
>> assert(mdr->slave_request == 0); // only one at a time, please! 
>> +
>> + if (straydn) {
>> + mdr->pin(straydn);
>> + mdr->straydn = straydn;
>> + }
>> +
>> + if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
>> + dout(3) << "not clientreplay|active yet, waiting" << dendl;
>> + mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
>> + return;
>> + } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
>> + mdr->locks.empty()) {
>> + dout(3) << "not active yet, waiting" << dendl;
>> + mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
>> + return;
>> + }
>> +
>> mdr->slave_request = m;
>>
>> dispatch_slave_request(mdr);
>> @@ -1339,6 +1361,12 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
>> {
>> int from = m->get_source().num();
>>
>> + if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
>> + dout(3) << "not clientreplay|active yet, waiting" << dendl;
>> + mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
>> + return;
>> + }
>> +
>> if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
>> metareqid_t r = m->get_reqid();
>> mds->mdcache->committed_master_slave(r, from);
>> @@ -5138,10 +5166,8 @@ void Server::handle_slave_rmdir_prep(MDRequest *mdr)
>> dout(10) << " dn " << *dn << dendl;
>> mdr->pin(dn);
>>
>> - assert(mdr->slave_request->stray.length() > 0);
>> - CDentry *straydn = mdcache->add_replica_stray(mdr->slave_request->stray, mdr->slave_to_mds);
>> - assert(straydn);
>> - mdr->pin(straydn);
>> + assert(mdr->straydn);
>> + CDentry *straydn = mdr->straydn;
>> dout(10) << " straydn " << *straydn << dendl;
>>
>> mdr->now = mdr->slave_request->now;
>> @@ -5208,6 +5234,7 @@ void Server::_logged_slave_rmdir(MDRequest *mdr, CDentry *dn, CDentry *straydn)
>> // done.
>> mdr->slave_request->put();
>> mdr->slave_request = 0;
>> + mdr->straydn = 0;
>> }
>>
>> void Server::handle_slave_rmdir_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
>> @@ -6460,15 +6487,12 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
>> // stray?
>> bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
>> (srcdnl->is_primary() || destdnl->is_primary()));
>> - CDentry *straydn = 0;
>> - if (destdnl->is_primary() && !linkmerge) {
>> - assert(mdr->slave_request->stray.length() > 0);
>> - straydn = mdcache->add_replica_stray(mdr->slave_request->stray, mdr->slave_to_mds);
>> + CDentry *straydn = mdr->straydn;
>> + if (destdnl->is_primary() && !linkmerge)
>> assert(straydn);
>> - mdr->pin(straydn);
>> - }
>>
>> mdr->now = mdr->slave_request->now;
>> + mdr->more()->srcdn_auth_mds = srcdn->authority().first;
>>
>> // set up commit waiter (early, to clean up any freezing etc we do)
>> if (!mdr->more()->slave_commit)
>> @@ -6651,6 +6675,7 @@ void Server::_logged_slave_rename(MDRequest *mdr,
>> // done.
>> mdr->slave_request->put();
>> mdr->slave_request = 0;
>> + mdr->straydn = 0;
>> }
>>
>> void Server::_commit_slave_rename(MDRequest *mdr, int r,
>> -- 
>> 1.7.11.7
> 
> 
> 


^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 13/39] mds: don't send resolve message between active MDS
  2013-03-20 21:56   ` Gregory Farnum
@ 2013-03-21  2:55     ` Yan, Zheng
  2013-03-21 21:55       ` Gregory Farnum
  0 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-21  2:55 UTC (permalink / raw)
  To: Gregory Farnum; +Cc: ceph-devel, Sage Weil

On 03/21/2013 05:56 AM, Gregory Farnum wrote:
> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>
>> When the MDS cluster is resolving, the current behavior is to send a subtree
>> resolve message to all other MDSes and wait for all other MDSes' resolve
>> messages. The problem is that an active MDS can have a different subtree map
>> due to rename. Besides, gathering the active MDSes' resolve messages is also
>> racy. The only function of these messages is to disambiguate other MDSes'
>> imports. We can replace them with an import finish notification.
>>
>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>> ---
>>  src/mds/MDCache.cc  | 12 +++++++++---
>>  src/mds/Migrator.cc | 25 +++++++++++++++++++++++--
>>  src/mds/Migrator.h  |  3 ++-
>>  3 files changed, 34 insertions(+), 6 deletions(-)
>>
>> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>> index c455a20..73c1d59 100644
>> --- a/src/mds/MDCache.cc
>> +++ b/src/mds/MDCache.cc
>> @@ -2517,7 +2517,8 @@ void MDCache::send_subtree_resolves()
>>         ++p) {
>>      if (*p == mds->whoami)
>>        continue;
>> -    resolves[*p] = new MMDSResolve;
>> +    if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
>> +      resolves[*p] = new MMDSResolve;
>>    }
>>
>>    // known
>> @@ -2837,7 +2838,7 @@ void MDCache::handle_resolve(MMDSResolve *m)
>>           migrator->import_reverse(dir);
>>         } else {
>>           dout(7) << "ambiguous import succeeded on " << *dir << dendl;
>> -         migrator->import_finish(dir);
>> +         migrator->import_finish(dir, true);
>>         }
>>         my_ambiguous_imports.erase(p);  // no longer ambiguous.
>>        }
>> @@ -3432,7 +3433,12 @@ void MDCache::rejoin_send_rejoins()
>>         ++p) {
>>      CDir *dir = p->first;
>>      assert(dir->is_subtree_root());
>> -    assert(!dir->is_ambiguous_dir_auth());
>> +    if (dir->is_ambiguous_dir_auth()) {
>> +      // exporter is recovering, importer is survivor.
> 
> The importer has to be the MDS this code is running on, right?

This code is for bystanders. The exporter is recovering, and its resolve message didn't claim
the subtree, so the export must have succeeded.

> 
>> +      assert(rejoins.count(dir->authority().first));
>> +      assert(!rejoins.count(dir->authority().second));
>> +      continue;
>> +    }
>>
>>      // my subtree?
>>      if (dir->is_auth())
>> diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
>> index 5e53803..833df12 100644
>> --- a/src/mds/Migrator.cc
>> +++ b/src/mds/Migrator.cc
>> @@ -2088,6 +2088,23 @@ void Migrator::import_reverse(CDir *dir)
>>    }
>>  }
>>
>> +void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
>> +{
>> +  dout(7) << "import_notify_finish " << *dir << dendl;
>> +
>> +  for (set<int>::iterator p = import_bystanders[dir].begin();
>> +       p != import_bystanders[dir].end();
>> +       ++p) {
>> +    MExportDirNotify *notify =
>> +      new MExportDirNotify(dir->dirfrag(), false,
>> +                          pair<int,int>(import_peer[dir->dirfrag()], mds->get_nodeid()),
>> +                          pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
> 
> I don't think this is quite right — we're notifying them that we've
> just finished importing data from somebody, right? And so we know that
> we're the auth node...

Yes. In the normal case, the exporter notifies the bystanders. But if the exporter crashes, the importer
notifies the bystanders after it confirms that the ambiguous import succeeded.

Thanks
Yan, Zheng

> 
>> +    for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); i++)
>> +      notify->get_bounds().push_back((*i)->dirfrag());
>> +    mds->send_message_mds(notify, *p);
>> +  }
>> +}
>> +
>>  void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
>>  {
>>    dout(7) << "import_notify_abort " << *dir << dendl;
>> @@ -2183,11 +2200,11 @@ void Migrator::handle_export_finish(MExportDirFinish *m)
>>    CDir *dir = cache->get_dirfrag(m->get_dirfrag());
>>    assert(dir);
>>    dout(7) << "handle_export_finish on " << *dir << dendl;
>> -  import_finish(dir);
>> +  import_finish(dir, false);
>>    m->put();
>>  }
>>
>> -void Migrator::import_finish(CDir *dir)
>> +void Migrator::import_finish(CDir *dir, bool notify)
>>  {
>>    dout(7) << "import_finish on " << *dir << dendl;
>>
>> @@ -2205,6 +2222,10 @@ void Migrator::import_finish(CDir *dir)
>>    // remove pins
>>    set<CDir*> bounds;
>>    cache->get_subtree_bounds(dir, bounds);
>> +
>> +  if (notify)
>> +    import_notify_finish(dir, bounds);
>> +
>>    import_remove_pins(dir, bounds);
>>
>>    map<CInode*, map<client_t,Capability::Export> > cap_imports;
>> diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
>> index 7988f32..2889a74 100644
>> --- a/src/mds/Migrator.h
>> +++ b/src/mds/Migrator.h
>> @@ -273,12 +273,13 @@ protected:
>>    void import_reverse_unfreeze(CDir *dir);
>>    void import_reverse_final(CDir *dir);
>>    void import_notify_abort(CDir *dir, set<CDir*>& bounds);
>> +  void import_notify_finish(CDir *dir, set<CDir*>& bounds);
>>    void import_logged_start(dirfrag_t df, CDir *dir, int from,
>>                            map<client_t,entity_inst_t> &imported_client_map,
>>                            map<client_t,uint64_t>& sseqmap);
>>    void handle_export_finish(MExportDirFinish *m);
>>  public:
>> -  void import_finish(CDir *dir);
>> +  void import_finish(CDir *dir, bool notify);
>>  protected:
>>
>>    void handle_export_caps(MExportCaps *m);
>> --
>> 1.7.11.7
>>

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 27/39] mds: send lock action message when auth MDS is in proper state.
  2013-03-17 14:51 ` [PATCH 27/39] mds: send lock action message when auth MDS is in proper state Yan, Zheng
@ 2013-03-21  3:12   ` Gregory Farnum
  2013-03-21  3:20     ` Yan, Zheng
  0 siblings, 1 reply; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21  3:12 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> For a rejoining object, don't send a lock ACK message because the lock states
> are still uncertain. The lock ACK may confuse the object's auth MDS and
> trigger an assertion.
>
> If the object's auth MDS is not active, just skip sending the NUDGE, REQRDLOCK
> and REQSCATTER messages. MDCache::handle_mds_recovery() will take care
> of them.
>
> Also defer the caps release message until the MDS is in the clientreplay or
> active state.
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/Locker.cc  | 46 ++++++++++++++++++++++++++++++----------------
>  src/mds/MDCache.cc | 13 +++++++++++--
>  2 files changed, 41 insertions(+), 18 deletions(-)
>
> diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
> index 28920d4..ece39e3 100644
> --- a/src/mds/Locker.cc
> +++ b/src/mds/Locker.cc
> @@ -658,6 +658,13 @@ void Locker::eval_gather(SimpleLock *lock, bool first, bool *pneed_issue, list<C
>        // replica: tell auth
>        int auth = lock->get_parent()->authority().first;
>
> +      if (lock->get_parent()->is_rejoining() &&
> +         mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) {
> +       dout(7) << "eval_gather finished gather, but still rejoining "
> +               << *lock->get_parent() << dendl;
> +       return;
> +      }
> +
>        if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
>         switch (lock->get_state()) {
>         case LOCK_SYNC_LOCK:
> @@ -1050,9 +1057,11 @@ bool Locker::_rdlock_kick(SimpleLock *lock, bool as_anon)
>      } else {
>        // request rdlock state change from auth
>        int auth = lock->get_parent()->authority().first;
> -      dout(10) << "requesting rdlock from auth on "
> -              << *lock << " on " << *lock->get_parent() << dendl;
> -      mds->send_message_mds(new MLock(lock, LOCK_AC_REQRDLOCK, mds->get_nodeid()), auth);
> +      if (mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
> +       dout(10) << "requesting rdlock from auth on "
> +                << *lock << " on " << *lock->get_parent() << dendl;
> +       mds->send_message_mds(new MLock(lock, LOCK_AC_REQRDLOCK, mds->get_nodeid()), auth);
> +      }
>        return false;
>      }
>    }
> @@ -1272,9 +1281,11 @@ bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mut, bool nowait)
>        // replica.
>        // auth should be auth_pinned (see acquire_locks wrlock weird mustpin case).
>        int auth = lock->get_parent()->authority().first;
> -      dout(10) << "requesting scatter from auth on "
> -              << *lock << " on " << *lock->get_parent() << dendl;
> -      mds->send_message_mds(new MLock(lock, LOCK_AC_REQSCATTER, mds->get_nodeid()), auth);
> +      if (mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
> +       dout(10) << "requesting scatter from auth on "
> +                << *lock << " on " << *lock->get_parent() << dendl;
> +       mds->send_message_mds(new MLock(lock, LOCK_AC_REQSCATTER, mds->get_nodeid()), auth);
> +      }
>        break;
>      }
>    }
> @@ -1899,13 +1910,19 @@ void Locker::request_inode_file_caps(CInode *in)
>      }
>
>      int auth = in->authority().first;
> +    if (in->is_rejoining() &&
> +       mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) {
> +      mds->wait_for_active_peer(auth, new C_MDL_RequestInodeFileCaps(this, in));
> +      return;
> +    }
> +
>      dout(7) << "request_inode_file_caps " << ccap_string(wanted)
>              << " was " << ccap_string(in->replica_caps_wanted)
>              << " on " << *in << " to mds." << auth << dendl;
>
>      in->replica_caps_wanted = wanted;
>
> -    if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN)
> +    if (mds->mdsmap->is_clientreplay_or_active_or_stopping(auth))
>        mds->send_message_mds(new MInodeFileCaps(in->ino(), in->replica_caps_wanted),
>                             auth);
>    }
> @@ -1924,14 +1941,6 @@ void Locker::handle_inode_file_caps(MInodeFileCaps *m)
>    assert(in);
>    assert(in->is_auth());
>
> -  if (mds->is_rejoin() &&
> -      in->is_rejoining()) {
> -    dout(7) << "handle_inode_file_caps still rejoining " << *in << ", dropping " << *m << dendl;
> -    m->put();
> -    return;
> -  }

This is okay since we catch it in the follow-on functions (I assume
that's why you removed it, to avoid checks at more levels than
necessary), but if you could note that's why in the commit message
it'll prevent anyone else from needing to go check like I did. :)

The code looks good.
Reviewed-by: Greg Farnum <greg@inktank.com>

> -
> -
>    dout(7) << "handle_inode_file_caps replica mds." << from << " wants caps " << ccap_string(m->get_caps()) << " on " << *in << dendl;
>
>    if (m->get_caps())
> @@ -2850,6 +2859,11 @@ void Locker::handle_client_cap_release(MClientCapRelease *m)
>    client_t client = m->get_source().num();
>    dout(10) << "handle_client_cap_release " << *m << dendl;
>
> +  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
> +    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
> +    return;
> +  }
> +
>    for (vector<ceph_mds_cap_item>::iterator p = m->caps.begin(); p != m->caps.end(); ++p) {
>      inodeno_t ino((uint64_t)p->ino);
>      CInode *in = mdcache->get_inode(ino);
> @@ -3859,7 +3873,7 @@ void Locker::scatter_nudge(ScatterLock *lock, Context *c, bool forcelockchange)
>              << *lock << " on " << *p << dendl;
>      // request unscatter?
>      int auth = lock->get_parent()->authority().first;
> -    if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_ACTIVE)
> +    if (mds->mdsmap->is_clientreplay_or_active_or_stopping(auth))
>        mds->send_message_mds(new MLock(lock, LOCK_AC_NUDGE, mds->get_nodeid()), auth);
>
>      // wait...
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 459b400..973a4d0 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -3321,8 +3321,10 @@ void MDCache::recalc_auth_bits()
>
>    if (root) {
>      root->inode_auth.first = mds->mdsmap->get_root();
> -    if (mds->whoami != root->inode_auth.first)
> +    if (mds->whoami != root->inode_auth.first) {
>        root->state_clear(CInode::STATE_AUTH);
> +      root->state_set(CInode::STATE_REJOINING);
> +    }
>    }
>
>    set<CInode*> subtree_inodes;
> @@ -3336,8 +3338,10 @@ void MDCache::recalc_auth_bits()
>         ++p) {
>
>      CInode *inode = p->first->get_inode();
> -    if (inode->is_mdsdir() && inode->ino() != MDS_INO_MDSDIR(mds->get_nodeid()))
> +    if (inode->is_mdsdir() && inode->ino() != MDS_INO_MDSDIR(mds->get_nodeid())) {
>        inode->state_clear(CInode::STATE_AUTH);
> +      inode->state_set(CInode::STATE_REJOINING);
> +    }
>
>      list<CDir*> dfq;  // dirfrag queue
>      dfq.push_back(p->first);
> @@ -3542,6 +3546,7 @@ void MDCache::rejoin_send_rejoins()
>                                     root->filelock.get_state(),
>                                     root->nestlock.get_state(),
>                                     root->dirfragtreelock.get_state());
> +       root->state_set(CInode::STATE_REJOINING);
>         if (root->is_dirty_scattered()) {
>           dout(10) << " sending scatterlock state on root " << *root << dendl;
>           p->second->add_scatterlock_state(root);
> @@ -3555,6 +3560,7 @@ void MDCache::rejoin_send_rejoins()
>                                     in->filelock.get_state(),
>                                     in->nestlock.get_state(),
>                                     in->dirfragtreelock.get_state());
> +       in->state_set(CInode::STATE_REJOINING);
>        }
>      }
>    }
> @@ -3694,6 +3700,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
>      // STRONG
>      dout(15) << " add_strong_dirfrag " << *dir << dendl;
>      rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
> +    dir->state_set(CDir::STATE_REJOINING);
>
>      for (CDir::map_t::iterator p = dir->items.begin();
>          p != dir->items.end();
> @@ -3707,6 +3714,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
>                                 dnl->is_remote() ? dnl->get_remote_d_type():0,
>                                 dn->get_replica_nonce(),
>                                 dn->lock.get_state());
> +      dn->state_set(CDentry::STATE_REJOINING);
>        if (dnl->is_primary()) {
>         CInode *in = dnl->get_inode();
>         dout(15) << " add_strong_inode " << *in << dendl;
> @@ -3716,6 +3724,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
>                                  in->filelock.get_state(),
>                                  in->nestlock.get_state(),
>                                  in->dirfragtreelock.get_state());
> +       in->state_set(CInode::STATE_REJOINING);
>         in->get_nested_dirfrags(nested);
>         if (in->is_dirty_scattered()) {
>           dout(10) << " sending scatterlock state on " << *in << dendl;
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 28/39] mds: add dirty imported dirfrag to LogSegment
  2013-03-17 14:51 ` [PATCH 28/39] mds: add dirty imported dirfrag to LogSegment Yan, Zheng
@ 2013-03-21  3:14   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21  3:14 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Whoops!
Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/CDir.cc     | 7 +++++--
>  src/mds/CDir.h      | 2 +-
>  src/mds/Migrator.cc | 2 +-
>  3 files changed, 7 insertions(+), 4 deletions(-)
>
> diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
> index af0ae9c..34bd8d3 100644
> --- a/src/mds/CDir.cc
> +++ b/src/mds/CDir.cc
> @@ -2164,7 +2164,7 @@ void CDir::finish_export(utime_t now)
>    dirty_old_rstat.clear();
>  }
>
> -void CDir::decode_import(bufferlist::iterator& blp, utime_t now)
> +void CDir::decode_import(bufferlist::iterator& blp, utime_t now, LogSegment *ls)
>  {
>    ::decode(first, blp);
>    ::decode(fnode, blp);
> @@ -2177,7 +2177,10 @@ void CDir::decode_import(bufferlist::iterator& blp, utime_t now)
>    ::decode(s, blp);
>    state &= MASK_STATE_IMPORT_KEPT;
>    state |= (s & MASK_STATE_EXPORTED);
> -  if (is_dirty()) get(PIN_DIRTY);
> +  if (is_dirty()) {
> +    get(PIN_DIRTY);
> +    _mark_dirty(ls);
> +  }
>
>    ::decode(dir_rep, blp);
>
> diff --git a/src/mds/CDir.h b/src/mds/CDir.h
> index f4a3a3d..7e1db73 100644
> --- a/src/mds/CDir.h
> +++ b/src/mds/CDir.h
> @@ -550,7 +550,7 @@ public:
>    void abort_export() {
>      put(PIN_TEMPEXPORTING);
>    }
> -  void decode_import(bufferlist::iterator& blp, utime_t now);
> +  void decode_import(bufferlist::iterator& blp, utime_t now, LogSegment *ls);
>
>
>    // -- auth pins --
> diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
> index 833df12..d626cb1 100644
> --- a/src/mds/Migrator.cc
> +++ b/src/mds/Migrator.cc
> @@ -2397,7 +2397,7 @@ int Migrator::decode_import_dir(bufferlist::iterator& blp,
>    dout(7) << "decode_import_dir " << *dir << dendl;
>
>    // assimilate state
> -  dir->decode_import(blp, now);
> +  dir->decode_import(blp, now, ls);
>
>    // mark  (may already be marked from get_or_open_dir() above)
>    if (!dir->is_auth())
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 29/39] mds: avoid double auth pin for file recovery
  2013-03-17 14:51 ` [PATCH 29/39] mds: avoid double auth pin for file recovery Yan, Zheng
@ 2013-03-21  3:20   ` Gregory Farnum
  2013-03-21  3:33     ` Yan, Zheng
  2013-03-21 21:58     ` Gregory Farnum
  0 siblings, 2 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21  3:20 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

This looks good on its face but I haven't had the chance to dig
through the recovery queue stuff yet (it's on my list following some
issues with recovery speed). How'd you run across this? If it's being
added to the recovery queue multiple times I want to make sure we
don't have some other machinery trying to dequeue it multiple times,
or a single waiter which needs to be a list or something.
-Greg

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 973a4d0..e9a79cd 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -5502,8 +5502,10 @@ void MDCache::_queue_file_recover(CInode *in)
>    dout(15) << "_queue_file_recover " << *in << dendl;
>    assert(in->is_auth());
>    in->state_clear(CInode::STATE_NEEDSRECOVER);
> -  in->state_set(CInode::STATE_RECOVERING);
> -  in->auth_pin(this);
> +  if (!in->state_test(CInode::STATE_RECOVERING)) {
> +    in->state_set(CInode::STATE_RECOVERING);
> +    in->auth_pin(this);
> +  }
>    file_recover_queue.insert(in);
>  }
>
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 27/39] mds: send lock action message when auth MDS is in proper state.
  2013-03-21  3:12   ` Gregory Farnum
@ 2013-03-21  3:20     ` Yan, Zheng
  0 siblings, 0 replies; 117+ messages in thread
From: Yan, Zheng @ 2013-03-21  3:20 UTC (permalink / raw)
  To: Gregory Farnum; +Cc: ceph-devel, Sage Weil

On 03/21/2013 11:12 AM, Gregory Farnum wrote:
> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>
>> For rejoining object, don't send lock ACK message because lock states
>> are still uncertain. The lock ACK may confuse object's auth MDS and
>> trigger assertion.
>>
>> If object's auth MDS is not active, just skip sending NUDGE, REQRDLOCK
>> and REQSCATTER messages. MDCache::handle_mds_recovery() will take care
>> of them.
>>
>> Also defer caps release message until clientreplay or active
>>
>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>> ---
>>  src/mds/Locker.cc  | 46 ++++++++++++++++++++++++++++++----------------
>>  src/mds/MDCache.cc | 13 +++++++++++--
>>  2 files changed, 41 insertions(+), 18 deletions(-)
>>
>> diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
>> index 28920d4..ece39e3 100644
>> --- a/src/mds/Locker.cc
>> +++ b/src/mds/Locker.cc
>> @@ -658,6 +658,13 @@ void Locker::eval_gather(SimpleLock *lock, bool first, bool *pneed_issue, list<C
>>        // replica: tell auth
>>        int auth = lock->get_parent()->authority().first;
>>
>> +      if (lock->get_parent()->is_rejoining() &&
>> +         mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) {
>> +       dout(7) << "eval_gather finished gather, but still rejoining "
>> +               << *lock->get_parent() << dendl;
>> +       return;
>> +      }
>> +
>>        if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
>>         switch (lock->get_state()) {
>>         case LOCK_SYNC_LOCK:
>> @@ -1050,9 +1057,11 @@ bool Locker::_rdlock_kick(SimpleLock *lock, bool as_anon)
>>      } else {
>>        // request rdlock state change from auth
>>        int auth = lock->get_parent()->authority().first;
>> -      dout(10) << "requesting rdlock from auth on "
>> -              << *lock << " on " << *lock->get_parent() << dendl;
>> -      mds->send_message_mds(new MLock(lock, LOCK_AC_REQRDLOCK, mds->get_nodeid()), auth);
>> +      if (mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
>> +       dout(10) << "requesting rdlock from auth on "
>> +                << *lock << " on " << *lock->get_parent() << dendl;
>> +       mds->send_message_mds(new MLock(lock, LOCK_AC_REQRDLOCK, mds->get_nodeid()), auth);
>> +      }
>>        return false;
>>      }
>>    }
>> @@ -1272,9 +1281,11 @@ bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mut, bool nowait)
>>        // replica.
>>        // auth should be auth_pinned (see acquire_locks wrlock weird mustpin case).
>>        int auth = lock->get_parent()->authority().first;
>> -      dout(10) << "requesting scatter from auth on "
>> -              << *lock << " on " << *lock->get_parent() << dendl;
>> -      mds->send_message_mds(new MLock(lock, LOCK_AC_REQSCATTER, mds->get_nodeid()), auth);
>> +      if (mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
>> +       dout(10) << "requesting scatter from auth on "
>> +                << *lock << " on " << *lock->get_parent() << dendl;
>> +       mds->send_message_mds(new MLock(lock, LOCK_AC_REQSCATTER, mds->get_nodeid()), auth);
>> +      }
>>        break;
>>      }
>>    }
>> @@ -1899,13 +1910,19 @@ void Locker::request_inode_file_caps(CInode *in)
>>      }
>>
>>      int auth = in->authority().first;
>> +    if (in->is_rejoining() &&
>> +       mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) {
>> +      mds->wait_for_active_peer(auth, new C_MDL_RequestInodeFileCaps(this, in));
>> +      return;
>> +    }
>> +
>>      dout(7) << "request_inode_file_caps " << ccap_string(wanted)
>>              << " was " << ccap_string(in->replica_caps_wanted)
>>              << " on " << *in << " to mds." << auth << dendl;
>>
>>      in->replica_caps_wanted = wanted;
>>
>> -    if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN)
>> +    if (mds->mdsmap->is_clientreplay_or_active_or_stopping(auth))
>>        mds->send_message_mds(new MInodeFileCaps(in->ino(), in->replica_caps_wanted),
>>                             auth);
>>    }
>> @@ -1924,14 +1941,6 @@ void Locker::handle_inode_file_caps(MInodeFileCaps *m)
>>    assert(in);
>>    assert(in->is_auth());
>>
>> -  if (mds->is_rejoin() &&
>> -      in->is_rejoining()) {
>> -    dout(7) << "handle_inode_file_caps still rejoining " << *in << ", dropping " << *m << dendl;
>> -    m->put();
>> -    return;
>> -  }
> 
> This is okay since we catch it in the follow-on functions (I assume
> that's why you removed it, to avoid checks at more levels than
> necessary), but if you could note that's why in the commit message
> it'll prevent anyone else from needing to go check like I did. :)
> 

If an inode is auth, it cannot be rejoining. That's why I removed it.

Thanks
Yan, Zheng


> The code looks good.
> Reviewed-by: Greg Farnum <greg@inktank.com>
> 
>> -
>> -
>>    dout(7) << "handle_inode_file_caps replica mds." << from << " wants caps " << ccap_string(m->get_caps()) << " on " << *in << dendl;
>>
>>    if (m->get_caps())
>> @@ -2850,6 +2859,11 @@ void Locker::handle_client_cap_release(MClientCapRelease *m)
>>    client_t client = m->get_source().num();
>>    dout(10) << "handle_client_cap_release " << *m << dendl;
>>
>> +  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
>> +    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
>> +    return;
>> +  }
>> +
>>    for (vector<ceph_mds_cap_item>::iterator p = m->caps.begin(); p != m->caps.end(); ++p) {
>>      inodeno_t ino((uint64_t)p->ino);
>>      CInode *in = mdcache->get_inode(ino);
>> @@ -3859,7 +3873,7 @@ void Locker::scatter_nudge(ScatterLock *lock, Context *c, bool forcelockchange)
>>              << *lock << " on " << *p << dendl;
>>      // request unscatter?
>>      int auth = lock->get_parent()->authority().first;
>> -    if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_ACTIVE)
>> +    if (mds->mdsmap->is_clientreplay_or_active_or_stopping(auth))
>>        mds->send_message_mds(new MLock(lock, LOCK_AC_NUDGE, mds->get_nodeid()), auth);
>>
>>      // wait...
>> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>> index 459b400..973a4d0 100644
>> --- a/src/mds/MDCache.cc
>> +++ b/src/mds/MDCache.cc
>> @@ -3321,8 +3321,10 @@ void MDCache::recalc_auth_bits()
>>
>>    if (root) {
>>      root->inode_auth.first = mds->mdsmap->get_root();
>> -    if (mds->whoami != root->inode_auth.first)
>> +    if (mds->whoami != root->inode_auth.first) {
>>        root->state_clear(CInode::STATE_AUTH);
>> +      root->state_set(CInode::STATE_REJOINING);
>> +    }
>>    }
>>
>>    set<CInode*> subtree_inodes;
>> @@ -3336,8 +3338,10 @@ void MDCache::recalc_auth_bits()
>>         ++p) {
>>
>>      CInode *inode = p->first->get_inode();
>> -    if (inode->is_mdsdir() && inode->ino() != MDS_INO_MDSDIR(mds->get_nodeid()))
>> +    if (inode->is_mdsdir() && inode->ino() != MDS_INO_MDSDIR(mds->get_nodeid())) {
>>        inode->state_clear(CInode::STATE_AUTH);
>> +      inode->state_set(CInode::STATE_REJOINING);
>> +    }
>>
>>      list<CDir*> dfq;  // dirfrag queue
>>      dfq.push_back(p->first);
>> @@ -3542,6 +3546,7 @@ void MDCache::rejoin_send_rejoins()
>>                                     root->filelock.get_state(),
>>                                     root->nestlock.get_state(),
>>                                     root->dirfragtreelock.get_state());
>> +       root->state_set(CInode::STATE_REJOINING);
>>         if (root->is_dirty_scattered()) {
>>           dout(10) << " sending scatterlock state on root " << *root << dendl;
>>           p->second->add_scatterlock_state(root);
>> @@ -3555,6 +3560,7 @@ void MDCache::rejoin_send_rejoins()
>>                                     in->filelock.get_state(),
>>                                     in->nestlock.get_state(),
>>                                     in->dirfragtreelock.get_state());
>> +       in->state_set(CInode::STATE_REJOINING);
>>        }
>>      }
>>    }
>> @@ -3694,6 +3700,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
>>      // STRONG
>>      dout(15) << " add_strong_dirfrag " << *dir << dendl;
>>      rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
>> +    dir->state_set(CDir::STATE_REJOINING);
>>
>>      for (CDir::map_t::iterator p = dir->items.begin();
>>          p != dir->items.end();
>> @@ -3707,6 +3714,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
>>                                 dnl->is_remote() ? dnl->get_remote_d_type():0,
>>                                 dn->get_replica_nonce(),
>>                                 dn->lock.get_state());
>> +      dn->state_set(CDentry::STATE_REJOINING);
>>        if (dnl->is_primary()) {
>>         CInode *in = dnl->get_inode();
>>         dout(15) << " add_strong_inode " << *in << dendl;
>> @@ -3716,6 +3724,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
>>                                  in->filelock.get_state(),
>>                                  in->nestlock.get_state(),
>>                                  in->dirfragtreelock.get_state());
>> +       in->state_set(CInode::STATE_REJOINING);
>>         in->get_nested_dirfrags(nested);
>>         if (in->is_dirty_scattered()) {
>>           dout(10) << " sending scatterlock state on " << *in << dendl;
>> --
>> 1.7.11.7
>>


^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 30/39] mds: check MDS peer's state through mdsmap
  2013-03-17 14:51 ` [PATCH 30/39] mds: check MDS peer's state through mdsmap Yan, Zheng
@ 2013-03-21  3:24   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21  3:24 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Yep.
Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/Migrator.cc | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
> index d626cb1..143d71e 100644
> --- a/src/mds/Migrator.cc
> +++ b/src/mds/Migrator.cc
> @@ -238,7 +238,7 @@ void Migrator::handle_mds_failure_or_stop(int who)
>         export_unlock(dir);
>         export_locks.erase(dir);
>         dir->state_clear(CDir::STATE_EXPORTING);
> -       if (export_peer[dir] != who) // tell them.
> +       if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them.
>           mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]);
>         break;
>
> @@ -247,7 +247,7 @@ void Migrator::handle_mds_failure_or_stop(int who)
>         dir->unfreeze_tree();  // cancel the freeze
>         export_state.erase(dir); // clean up
>         dir->state_clear(CDir::STATE_EXPORTING);
> -       if (export_peer[dir] != who) // tell them.
> +       if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them.
>           mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]);
>         break;
>
> @@ -278,7 +278,7 @@ void Migrator::handle_mds_failure_or_stop(int who)
>         export_unlock(dir);
>         export_locks.erase(dir);
>         dir->state_clear(CDir::STATE_EXPORTING);
> -       if (export_peer[dir] != who) // tell them.
> +       if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them.
>           mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]);
>         break;
>
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 31/39] mds: unfreeze subtree if import aborts in PREPPED state
  2013-03-17 14:51 ` [PATCH 31/39] mds: unfreeze subtree if import aborts in PREPPED state Yan, Zheng
@ 2013-03-21  3:27   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21  3:27 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/Migrator.cc | 7 +++++--
>  1 file changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
> index 143d71e..963706c 100644
> --- a/src/mds/Migrator.cc
> +++ b/src/mds/Migrator.cc
> @@ -1658,11 +1658,14 @@ void Migrator::handle_export_cancel(MExportDirCancel *m)
>      CInode *in = cache->get_inode(df.ino);
>      assert(in);
>      import_reverse_discovered(df, in);
> -  } else if (import_state[df] == IMPORT_PREPPING ||
> -            import_state[df] == IMPORT_PREPPED) {
> +  } else if (import_state[df] == IMPORT_PREPPING) {
>      CDir *dir = mds->mdcache->get_dirfrag(df);
>      assert(dir);
>      import_reverse_prepping(dir);
> +  } else if (import_state[df] == IMPORT_PREPPED) {
> +    CDir *dir = mds->mdcache->get_dirfrag(df);
> +    assert(dir);
> +    import_reverse_unfreeze(dir);
>    } else {
>      assert(0 == "got export_cancel in weird state");
>    }
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 32/39] mds: fix export cancel notification
  2013-03-17 14:51 ` [PATCH 32/39] mds: fix export cancel notification Yan, Zheng
@ 2013-03-21  3:31   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21  3:31 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> The comment says that if the importer is dead, bystanders thinks the
> exporter is the only auth, as per mdcache->handle_mds_failure(). But
> there is no such code in MDCache::handle_mds_failure().
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/Migrator.cc | 20 +++++---------------
>  1 file changed, 5 insertions(+), 15 deletions(-)
>
> diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
> index 963706c..40a5394 100644
> --- a/src/mds/Migrator.cc
> +++ b/src/mds/Migrator.cc
> @@ -1390,17 +1390,9 @@ void Migrator::export_logged_finish(CDir *dir)
>    for (set<int>::iterator p = export_notify_ack_waiting[dir].begin();
>         p != export_notify_ack_waiting[dir].end();
>         ++p) {
> -    MExportDirNotify *notify;
> -    if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir]))
> -      // dest is still alive.
> -      notify = new MExportDirNotify(dir->dirfrag(), true,
> -                                   pair<int,int>(mds->get_nodeid(), dest),
> -                                   pair<int,int>(dest, CDIR_AUTH_UNKNOWN));
> -    else
> -      // dest is dead.  bystanders will think i am only auth, as per mdcache->handle_mds_failure()
> -      notify = new MExportDirNotify(dir->dirfrag(), true,
> -                                   pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN),
> -                                   pair<int,int>(dest, CDIR_AUTH_UNKNOWN));
> +    MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), true,
> +                                                   pair<int,int>(mds->get_nodeid(), dest),
> +                                                   pair<int,int>(dest, CDIR_AUTH_UNKNOWN));
>
>      for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); i++)
>        notify->get_bounds().push_back((*i)->dirfrag());
> @@ -2115,11 +2107,9 @@ void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
>    for (set<int>::iterator p = import_bystanders[dir].begin();
>         p != import_bystanders[dir].end();
>         ++p) {
> -    // NOTE: the bystander will think i am _only_ auth, because they will have seen
> -    // the exporter's failure and updated the subtree auth.  see mdcache->handle_mds_failure().
> -    MExportDirNotify *notify =
> +    MExportDirNotify *notify =
>        new MExportDirNotify(dir->dirfrag(), true,
> -                          pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN),
> +                          pair<int,int>(import_peer[dir->dirfrag()], mds->get_nodeid()),
>                            pair<int,int>(import_peer[dir->dirfrag()], CDIR_AUTH_UNKNOWN));
>      for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); i++)
>        notify->get_bounds().push_back((*i)->dirfrag());
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 29/39] mds: avoid double auth pin for file recovery
  2013-03-21  3:20   ` Gregory Farnum
@ 2013-03-21  3:33     ` Yan, Zheng
  2013-03-21  4:20       ` Sage Weil
  2013-03-21 21:58     ` Gregory Farnum
  1 sibling, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-21  3:33 UTC (permalink / raw)
  To: Gregory Farnum; +Cc: ceph-devel, Sage Weil

On 03/21/2013 11:20 AM, Gregory Farnum wrote:
> This looks good on its face but I haven't had the chance to dig
> through the recovery queue stuff yet (it's on my list following some
> issues with recovery speed). How'd you run across this? If it's being
> added to the recovery queue multiple times I want to make sure we
> don't have some other machinery trying to dequeue it multiple times,
> or a single waiter which needs to be a list or something.
> -Greg

Two clients that were writing the same file crashed successively.

Thanks,
Yan, Zheng

> 
> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>
>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>> ---
>>  src/mds/MDCache.cc | 6 ++++--
>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>> index 973a4d0..e9a79cd 100644
>> --- a/src/mds/MDCache.cc
>> +++ b/src/mds/MDCache.cc
>> @@ -5502,8 +5502,10 @@ void MDCache::_queue_file_recover(CInode *in)
>>    dout(15) << "_queue_file_recover " << *in << dendl;
>>    assert(in->is_auth());
>>    in->state_clear(CInode::STATE_NEEDSRECOVER);
>> -  in->state_set(CInode::STATE_RECOVERING);
>> -  in->auth_pin(this);
>> +  if (!in->state_test(CInode::STATE_RECOVERING)) {
>> +    in->state_set(CInode::STATE_RECOVERING);
>> +    in->auth_pin(this);
>> +  }
>>    file_recover_queue.insert(in);
>>  }
>>
>> --
>> 1.7.11.7
>>


^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 33/39] mds: notify bystanders if export aborts
  2013-03-17 14:51 ` [PATCH 33/39] mds: notify bystanders if export aborts Yan, Zheng
@ 2013-03-21  3:34   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21  3:34 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> So bystanders know the subtree is single auth earlier.
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/Migrator.cc | 34 ++++++++++++++++++++++++++--------
>  src/mds/Migrator.h  |  1 +
>  2 files changed, 27 insertions(+), 8 deletions(-)
>
> diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
> index 40a5394..0672d03 100644
> --- a/src/mds/Migrator.cc
> +++ b/src/mds/Migrator.cc
> @@ -251,25 +251,28 @@ void Migrator::handle_mds_failure_or_stop(int who)
>           mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]);
>         break;
>
> -       // NOTE: state order reversal, warning comes after loggingstart+prepping
> +       // NOTE: state order reversal, warning comes after prepping
>        case EXPORT_WARNING:
>         dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl;
>         // fall-thru
>
>        case EXPORT_PREPPING:
>         if (p->second != EXPORT_WARNING)
> -         dout(10) << "export state=loggingstart|prepping : unpinning bounds, unfreezing" << dendl;
> +         dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl;
>         {
>           // unpin bounds
>           set<CDir*> bounds;
>           cache->get_subtree_bounds(dir, bounds);
> -         for (set<CDir*>::iterator p = bounds.begin();
> -              p != bounds.end();
> -              ++p) {
> -           CDir *bd = *p;
> +         for (set<CDir*>::iterator q = bounds.begin();
> +              q != bounds.end();
> +              ++q) {
> +           CDir *bd = *q;
>             bd->put(CDir::PIN_EXPORTBOUND);
>             bd->state_clear(CDir::STATE_EXPORTBOUND);
>           }
> +         // notify bystanders
> +         if (p->second == EXPORT_WARNING)
> +           export_notify_abort(dir, bounds);
>         }
>         dir->unfreeze_tree();
>         export_state.erase(dir); // clean up
> @@ -1307,9 +1310,21 @@ void Migrator::handle_export_ack(MExportDirAck *m)
>    m->put();
>  }
>
> +void Migrator::export_notify_abort(CDir *dir, set<CDir*>& bounds)
> +{
> +  dout(7) << "export_notify_abort " << *dir << dendl;
>
> -
> -
> +  for (set<int>::iterator p = export_notify_ack_waiting[dir].begin();
> +       p != export_notify_ack_waiting[dir].end();
> +       ++p) {
> +    MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), false,
> +                                                   pair<int,int>(mds->get_nodeid(),export_peer[dir]),
> +                                                   pair<int,int>(mds->get_nodeid(),CDIR_AUTH_UNKNOWN));
> +    for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
> +      notify->get_bounds().push_back((*i)->dirfrag());
> +    mds->send_message_mds(notify, *p);
> +  }
> +}
>
>  /*
>   * this happens if hte dest failes after i send teh export data but before it is acked
> @@ -1356,6 +1371,9 @@ void Migrator::export_reverse(CDir *dir)
>      bd->state_clear(CDir::STATE_EXPORTBOUND);
>    }
>
> +  // notify bystanders
> +  export_notify_abort(dir, bounds);
> +
>    // process delayed expires
>    cache->process_delayed_expire(dir);
>
> diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
> index 2889a74..f395bc1 100644
> --- a/src/mds/Migrator.h
> +++ b/src/mds/Migrator.h
> @@ -227,6 +227,7 @@ public:
>    void export_go(CDir *dir);
>    void export_go_synced(CDir *dir);
>    void export_reverse(CDir *dir);
> +  void export_notify_abort(CDir *dir, set<CDir*>& bounds);
>    void handle_export_ack(MExportDirAck *m);
>    void export_logged_finish(CDir *dir);
>    void handle_export_notify_ack(MExportDirNotifyAck *m);
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 34/39] mds: don't open dirfrag while subtree is frozen
  2013-03-17 14:51 ` [PATCH 34/39] mds: don't open dirfrag while subtree is frozen Yan, Zheng
@ 2013-03-21  3:38   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21  3:38 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index e9a79cd..30687ec 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -7101,9 +7101,9 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, Context *fin,     // wh
>      if (!curdir) {
>        if (cur->is_auth()) {
>          // parent dir frozen_dir?
> -        if (cur->is_frozen_dir()) {
> -          dout(7) << "traverse: " << *cur->get_parent_dir() << " is frozen_dir, waiting" << dendl;
> -          cur->get_parent_dn()->get_dir()->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
> +        if (cur->is_frozen()) {
> +          dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
> +          cur->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
>            return 1;
>          }
>          curdir = cur->get_or_open_dirfrag(this, fg);
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 35/39] mds: clear dirty inode rstat if import fails
  2013-03-17 14:51 ` [PATCH 35/39] mds: clear dirty inode rstat if import fails Yan, Zheng
@ 2013-03-21  3:40   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21  3:40 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/CDir.cc     | 1 +
>  src/mds/Migrator.cc | 2 ++
>  2 files changed, 3 insertions(+)
>
> diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
> index 34bd8d3..47b6753 100644
> --- a/src/mds/CDir.cc
> +++ b/src/mds/CDir.cc
> @@ -1022,6 +1022,7 @@ void CDir::assimilate_dirty_rstat_inodes()
>    for (elist<CInode*>::iterator p = dirty_rstat_inodes.begin_use_current();
>         !p.end(); ++p) {
>      CInode *in = *p;
> +    assert(in->is_auth());
>      if (in->is_frozen())
>        continue;
>
> diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
> index 0672d03..f563b8d 100644
> --- a/src/mds/Migrator.cc
> +++ b/src/mds/Migrator.cc
> @@ -2052,6 +2052,8 @@ void Migrator::import_reverse(CDir *dir)
>         in->clear_replica_map();
>         if (in->is_dirty())
>           in->mark_clean();
> +       in->clear_dirty_rstat();
> +
>         in->authlock.clear_gather();
>         in->linklock.clear_gather();
>         in->dirfragtreelock.clear_gather();
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 36/39] mds: try merging subtree after clear EXPORTBOUND
  2013-03-17 14:51 ` [PATCH 36/39] mds: try merging subtree after clear EXPORTBOUND Yan, Zheng
@ 2013-03-21  3:44   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21  3:44 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/Migrator.cc | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
> index f563b8d..9cbad87 100644
> --- a/src/mds/Migrator.cc
> +++ b/src/mds/Migrator.cc
> @@ -1340,10 +1340,6 @@ void Migrator::export_reverse(CDir *dir)
>    set<CDir*> bounds;
>    cache->get_subtree_bounds(dir, bounds);
>
> -  // adjust auth, with possible subtree merge.
> -  cache->adjust_subtree_auth(dir, mds->get_nodeid());
> -  cache->try_subtree_merge(dir);  // NOTE: may journal subtree_map as side-effect
> -
>    // remove exporting pins
>    list<CDir*> rq;
>    rq.push_back(dir);
> @@ -1371,6 +1367,10 @@ void Migrator::export_reverse(CDir *dir)
>      bd->state_clear(CDir::STATE_EXPORTBOUND);
>    }
>
> +  // adjust auth, with possible subtree merge.
> +  cache->adjust_subtree_auth(dir, mds->get_nodeid());
> +  cache->try_subtree_merge(dir);  // NOTE: may journal subtree_map as side-effect
> +
>    // notify bystanders
>    export_notify_abort(dir, bounds);
>
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 37/39] mds: eval inodes with caps imported by cache rejoin message
  2013-03-17 14:51 ` [PATCH 37/39] mds: eval inodes with caps imported by cache rejoin message Yan, Zheng
@ 2013-03-21  3:45   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21  3:45 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 30687ec..24f1109 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -3823,6 +3823,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
>         dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
>         rejoin_import_cap(in, q->first, q->second, from);
>        }
> +      mds->locker->eval(in, CEPH_CAP_LOCKS, true);
>      }
>    } else {
>      assert(mds->is_rejoin());
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 38/39] mds: don't replicate purging dentry
  2013-03-17 14:51 ` [PATCH 38/39] mds: don't replicate purging dentry Yan, Zheng
@ 2013-03-21  3:46   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21  3:46 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> open_remote_ino is racy, it's possible someone deletes the inode's
> last linkage while the MDS is discovering the inode.
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc | 9 ++++++++-
>  1 file changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 24f1109..d730ff1 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -9225,8 +9225,15 @@ void MDCache::handle_discover(MDiscover *dis)
>      if (dis->get_want_ino()) {
>        // lookup by ino
>        CInode *in = get_inode(dis->get_want_ino(), snapid);
> -      if (in && in->is_auth() && in->get_parent_dn()->get_dir() == curdir)
> +      if (in && in->is_auth() && in->get_parent_dn()->get_dir() == curdir) {
>         dn = in->get_parent_dn();
> +       if (dn->state_test(CDentry::STATE_PURGING)) {
> +         // set error flag in reply
> +         dout(7) << "dentry " << *dn << " is purging, flagging error ino" << dendl;
> +         reply->set_flag_error_ino();
> +         break;
> +       }
> +      }
>      } else if (dis->get_want().depth() > 0) {
>        // lookup dentry
>        dn = curdir->lookup(dis->get_dentry(i), snapid);
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 39/39] mds: clear scatter dirty if replica inode has no auth subtree
  2013-03-17 14:51 ` [PATCH 39/39] mds: clear scatter dirty if replica inode has no auth subtree Yan, Zheng
@ 2013-03-21  3:49   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21  3:49 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Reviewed-by: Greg Farnum <greg@inktank.com>

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> This avoids sending superfluous scatterlock state to recovering MDS
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/CInode.cc   |  5 +++--
>  src/mds/CInode.h    |  2 +-
>  src/mds/MDCache.cc  | 13 ++++++-------
>  src/mds/Migrator.cc | 15 +++++++++++++++
>  4 files changed, 25 insertions(+), 10 deletions(-)
>
> diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
> index 42137f3..25cb6c1 100644
> --- a/src/mds/CInode.cc
> +++ b/src/mds/CInode.cc
> @@ -615,12 +615,13 @@ void CInode::close_dirfrags()
>      close_dirfrag(dirfrags.begin()->first);
>  }
>
> -bool CInode::has_subtree_root_dirfrag()
> +bool CInode::has_subtree_root_dirfrag(int auth)
>  {
>    for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
>         p != dirfrags.end();
>         ++p)
> -    if (p->second->is_subtree_root())
> +    if (p->second->is_subtree_root() &&
> +       (auth == -1 || p->second->dir_auth.first == auth))
>        return true;
>    return false;
>  }
> diff --git a/src/mds/CInode.h b/src/mds/CInode.h
> index f7b8f33..bea7430 100644
> --- a/src/mds/CInode.h
> +++ b/src/mds/CInode.h
> @@ -344,7 +344,7 @@ public:
>    CDir *add_dirfrag(CDir *dir);
>    void close_dirfrag(frag_t fg);
>    void close_dirfrags();
> -  bool has_subtree_root_dirfrag();
> +  bool has_subtree_root_dirfrag(int auth=-1);
>
>    void force_dirfrags();
>    void verify_dirfrags();
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index d730ff1..75c7ded 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -3330,8 +3330,10 @@ void MDCache::recalc_auth_bits()
>    set<CInode*> subtree_inodes;
>    for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
>         p != subtrees.end();
> -       ++p)
> -    subtree_inodes.insert(p->first->inode);
> +       ++p) {
> +    if (p->first->dir_auth.first == mds->get_nodeid())
> +      subtree_inodes.insert(p->first->inode);
> +  }
>
>    for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
>         p != subtrees.end();
> @@ -3390,11 +3392,8 @@ void MDCache::recalc_auth_bits()
>             if (dnl->get_inode()->is_dirty())
>               dnl->get_inode()->mark_clean();
>             // avoid touching scatterlocks for our subtree roots!
> -           if (subtree_inodes.count(dnl->get_inode()) == 0) {
> -             dnl->get_inode()->filelock.remove_dirty();
> -             dnl->get_inode()->nestlock.remove_dirty();
> -             dnl->get_inode()->dirfragtreelock.remove_dirty();
> -           }
> +           if (subtree_inodes.count(dnl->get_inode()) == 0)
> +             dnl->get_inode()->clear_scatter_dirty();
>           }
>
>           // recurse?
> diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
> index 9cbad87..49d21ab 100644
> --- a/src/mds/Migrator.cc
> +++ b/src/mds/Migrator.cc
> @@ -1095,6 +1095,10 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, list<Context*>& fini
>
>    in->clear_dirty_rstat();
>
> +  // no more auth subtree? clear scatter dirty
> +  if (!in->has_subtree_root_dirfrag(mds->get_nodeid()))
> +    in->clear_scatter_dirty();
> +
>    in->item_open_file.remove_myself();
>
>    // waiters
> @@ -1534,6 +1538,11 @@ void Migrator::export_finish(CDir *dir)
>    cache->adjust_subtree_auth(dir, export_peer[dir]);
>    cache->try_subtree_merge(dir);  // NOTE: may journal subtree_map as sideeffect
>
> +  // no more auth subtree? clear scatter dirty
> +  if (!dir->get_inode()->is_auth() &&
> +      !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid()))
> +    dir->get_inode()->clear_scatter_dirty();
> +
>    // unpin path
>    export_unlock(dir);
>
> @@ -2020,6 +2029,10 @@ void Migrator::import_reverse(CDir *dir)
>      cache->trim_non_auth_subtree(dir);
>    cache->adjust_subtree_auth(dir, import_peer[dir->dirfrag()]);
>
> +  if (!dir->get_inode()->is_auth() &&
> +      !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid()))
> +    dir->get_inode()->clear_scatter_dirty();
> +
>    // adjust auth bits.
>    list<CDir*> q;
>    q.push_back(dir);
> @@ -2053,6 +2066,8 @@ void Migrator::import_reverse(CDir *dir)
>         if (in->is_dirty())
>           in->mark_clean();
>         in->clear_dirty_rstat();
> +       if (!in->has_subtree_root_dirfrag(mds->get_nodeid()))
> +         in->clear_scatter_dirty();
>
>         in->authlock.clear_gather();
>         in->linklock.clear_gather();
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 11/39] mds: don't delay processing replica buffer in slave request
  2013-03-21  2:38     ` Yan, Zheng
@ 2013-03-21  4:15       ` Sage Weil
  2013-03-21 21:48         ` Gregory Farnum
  0 siblings, 1 reply; 117+ messages in thread
From: Sage Weil @ 2013-03-21  4:15 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Greg Farnum, ceph-devel

On Thu, 21 Mar 2013, Yan, Zheng wrote:
> On 03/21/2013 05:19 AM, Greg Farnum wrote:
> > On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:
> >> From: "Yan, Zheng" <zheng.z.yan@intel.com>
> >>
> >> Replicated objects need to be added into the cache immediately
> >>
> >> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> > Why do we need to add them right away? Shouldn't we have a journaled replica if we need it?
> > -Greg
> 
> The issue I encountered is lock action message received, but replicated objects wasn't in the
> cache because slave request was delayed.

This makes sense to me; the add_replica_*() methods that create and push 
replicas of cache objects to other nodes need to always be applied 
immediately, or else the cache coherency falls apart.

There are similar games played between the client and mds with the caps 
protocol, although in that case IIRC there are certain limited 
circumstances where we can delay processing the message.  For mds->mds 
traffic, I don't think that's possible, unless *all* potentially dependent 
traffic is also delayed to preserve ordering and so forth.

[That said, I didn't review the actual patch :)]

sage

> 
> Thanks
> Yan, Zheng
> 
> 
> > 
> > Software Engineer #42 @ http://inktank.com | http://ceph.com
> >> ---
> >> src/mds/MDCache.cc | 12 ++++++++++++
> >> src/mds/MDCache.h | 2 +-
> >> src/mds/MDS.cc | 6 +++---
> >> src/mds/Server.cc | 55 +++++++++++++++++++++++++++++++++++++++---------------
> >> 4 files changed, 56 insertions(+), 19 deletions(-)
> >>
> >> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> >> index 0f6b842..b668842 100644
> >> --- a/src/mds/MDCache.cc
> >> +++ b/src/mds/MDCache.cc
> >> @@ -7722,6 +7722,18 @@ void MDCache::_find_ino_dir(inodeno_t ino, Context *fin, bufferlist& bl, int r)
> >>
> >> /* ---------------------------- */
> >>
> >> +int MDCache::get_num_client_requests()
> >> +{
> >> + int count = 0;
> >> + for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
> >> + p != active_requests.end();
> >> + ++p) {
> >> + if (p->second->reqid.name.is_client() && !p->second->is_slave())
> >> + count++;
> >> + }
> >> + return count;
> >> +}
> >> +
> >> /* This function takes over the reference to the passed Message */
> >> MDRequest *MDCache::request_start(MClientRequest *req)
> >> {
> >> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
> >> index a9f05c6..4634121 100644
> >> --- a/src/mds/MDCache.h
> >> +++ b/src/mds/MDCache.h
> >> @@ -240,7 +240,7 @@ protected:
> >> hash_map<metareqid_t, MDRequest*> active_requests; 
> >>
> >> public:
> >> - int get_num_active_requests() { return active_requests.size(); }
> >> + int get_num_client_requests();
> >>
> >> MDRequest* request_start(MClientRequest *req);
> >> MDRequest* request_start_slave(metareqid_t rid, __u32 attempt, int by);
> >> diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
> >> index b91dcbd..e99eecc 100644
> >> --- a/src/mds/MDS.cc
> >> +++ b/src/mds/MDS.cc
> >> @@ -1900,9 +1900,9 @@ bool MDS::_dispatch(Message *m)
> >> mdcache->is_open() &&
> >> replay_queue.empty() &&
> >> want_state == MDSMap::STATE_CLIENTREPLAY) {
> >> - dout(10) << " still have " << mdcache->get_num_active_requests()
> >> - << " active replay requests" << dendl;
> >> - if (mdcache->get_num_active_requests() == 0)
> >> + int num_requests = mdcache->get_num_client_requests();
> >> + dout(10) << " still have " << num_requests << " active replay requests" << dendl;
> >> + if (num_requests == 0)
> >> clientreplay_done();
> >> }
> >>
> >> diff --git a/src/mds/Server.cc b/src/mds/Server.cc
> >> index 4c4c86b..8e89e4c 100644
> >> --- a/src/mds/Server.cc
> >> +++ b/src/mds/Server.cc
> >> @@ -107,10 +107,8 @@ void Server::dispatch(Message *m)
> >> (m->get_type() == CEPH_MSG_CLIENT_REQUEST &&
> >> (static_cast<MClientRequest*>(m))->is_replay()))) {
> >> // replaying!
> >> - } else if (mds->is_clientreplay() && m->get_type() == MSG_MDS_SLAVE_REQUEST &&
> >> - ((static_cast<MMDSSlaveRequest*>(m))->is_reply() ||
> >> - !mds->mdsmap->is_active(m->get_source().num()))) {
> >> - // slave reply or the master is also in the clientreplay stage
> >> + } else if (m->get_type() == MSG_MDS_SLAVE_REQUEST) {
> >> + // handle_slave_request() will wait if necessary
> >> } else {
> >> dout(3) << "not active yet, waiting" << dendl;
> >> mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
> >> @@ -1291,6 +1289,13 @@ void Server::handle_slave_request(MMDSSlaveRequest *m)
> >> if (m->is_reply())
> >> return handle_slave_request_reply(m);
> >>
> >> + CDentry *straydn = NULL;
> >> + if (m->stray.length() > 0) {
> >> + straydn = mdcache->add_replica_stray(m->stray, from);
> >> + assert(straydn);
> >> + m->stray.clear();
> >> + }
> >> +
> >> // am i a new slave?
> >> MDRequest *mdr = NULL;
> >> if (mdcache->have_request(m->get_reqid())) {
> >> @@ -1326,9 +1331,26 @@ void Server::handle_slave_request(MMDSSlaveRequest *m)
> >> m->put();
> >> return;
> >> }
> >> - mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m->get_source().num());
> >> + mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), from);
> >> }
> >> assert(mdr->slave_request == 0); // only one at a time, please! 
> >> +
> >> + if (straydn) {
> >> + mdr->pin(straydn);
> >> + mdr->straydn = straydn;
> >> + }
> >> +
> >> + if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
> >> + dout(3) << "not clientreplay|active yet, waiting" << dendl;
> >> + mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
> >> + return;
> >> + } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
> >> + mdr->locks.empty()) {
> >> + dout(3) << "not active yet, waiting" << dendl;
> >> + mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
> >> + return;
> >> + }
> >> +
> >> mdr->slave_request = m;
> >>
> >> dispatch_slave_request(mdr);
> >> @@ -1339,6 +1361,12 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
> >> {
> >> int from = m->get_source().num();
> >>
> >> + if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
> >> + dout(3) << "not clientreplay|active yet, waiting" << dendl;
> >> + mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
> >> + return;
> >> + }
> >> +
> >> if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
> >> metareqid_t r = m->get_reqid();
> >> mds->mdcache->committed_master_slave(r, from);
> >> @@ -5138,10 +5166,8 @@ void Server::handle_slave_rmdir_prep(MDRequest *mdr)
> >> dout(10) << " dn " << *dn << dendl;
> >> mdr->pin(dn);
> >>
> >> - assert(mdr->slave_request->stray.length() > 0);
> >> - CDentry *straydn = mdcache->add_replica_stray(mdr->slave_request->stray, mdr->slave_to_mds);
> >> - assert(straydn);
> >> - mdr->pin(straydn);
> >> + assert(mdr->straydn);
> >> + CDentry *straydn = mdr->straydn;
> >> dout(10) << " straydn " << *straydn << dendl;
> >>
> >> mdr->now = mdr->slave_request->now;
> >> @@ -5208,6 +5234,7 @@ void Server::_logged_slave_rmdir(MDRequest *mdr, CDentry *dn, CDentry *straydn)
> >> // done.
> >> mdr->slave_request->put();
> >> mdr->slave_request = 0;
> >> + mdr->straydn = 0;
> >> }
> >>
> >> void Server::handle_slave_rmdir_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
> >> @@ -6460,15 +6487,12 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
> >> // stray?
> >> bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
> >> (srcdnl->is_primary() || destdnl->is_primary()));
> >> - CDentry *straydn = 0;
> >> - if (destdnl->is_primary() && !linkmerge) {
> >> - assert(mdr->slave_request->stray.length() > 0);
> >> - straydn = mdcache->add_replica_stray(mdr->slave_request->stray, mdr->slave_to_mds);
> >> + CDentry *straydn = mdr->straydn;
> >> + if (destdnl->is_primary() && !linkmerge)
> >> assert(straydn);
> >> - mdr->pin(straydn);
> >> - }
> >>
> >> mdr->now = mdr->slave_request->now;
> >> + mdr->more()->srcdn_auth_mds = srcdn->authority().first;
> >>
> >> // set up commit waiter (early, to clean up any freezing etc we do)
> >> if (!mdr->more()->slave_commit)
> >> @@ -6651,6 +6675,7 @@ void Server::_logged_slave_rename(MDRequest *mdr,
> >> // done.
> >> mdr->slave_request->put();
> >> mdr->slave_request = 0;
> >> + mdr->straydn = 0;
> >> }
> >>
> >> void Server::_commit_slave_rename(MDRequest *mdr, int r,
> >> -- 
> >> 1.7.11.7
> > 
> > 
> > 
> 
> 
> 

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 29/39] mds: avoid double auth pin for file recovery
  2013-03-21  3:33     ` Yan, Zheng
@ 2013-03-21  4:20       ` Sage Weil
  0 siblings, 0 replies; 117+ messages in thread
From: Sage Weil @ 2013-03-21  4:20 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Gregory Farnum, ceph-devel

On Thu, 21 Mar 2013, Yan, Zheng wrote:
> On 03/21/2013 11:20 AM, Gregory Farnum wrote:
> > This looks good on its face but I haven't had the chance to dig
> > through the recovery queue stuff yet (it's on my list following some
> > issues with recovery speed). How'd you run across this? If it's being
> > added to the recovery queue multiple times I want to make sure we
> > don't have some other machinery trying to dequeue it multiple times,
> > or a single waiter which needs to be a list or something.
> > -Greg
> 
> Two clients that were writing the same file crashed successively.

Hmm, I would love to have a test case for this.  It should be pretty easy 
to construct some tests with libcephfs that fork, connect and do some 
operations, and are then killed by the parent, who verifies the resulting 
recovery occurs.  This is some of the more fragile code, not least because it
is rarely tested.
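
For instance, an (untested) sketch of such a test using only the public
libcephfs C API: two forked clients write the same file, get SIGKILLed
without unmounting, and the parent then stats the file so the MDS has to
run file recovery for the dead clients. The real test would want to verify
the recovered size/mtime rather than just print them:

  #include <cephfs/libcephfs.h>
  #include <fcntl.h>
  #include <signal.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/stat.h>
  #include <sys/wait.h>
  #include <unistd.h>

  /* child: make its own mount, then keep writing until killed */
  static pid_t spawn_writer(const char *path)
  {
    pid_t pid = fork();
    if (pid == 0) {
      struct ceph_mount_info *cmount;
      ceph_create(&cmount, NULL);
      ceph_conf_read_file(cmount, NULL);
      ceph_mount(cmount, "/");
      int fd = ceph_open(cmount, path, O_CREAT|O_WRONLY, 0644);
      char buf[4096];
      memset(buf, 'x', sizeof(buf));
      int64_t off = 0;
      for (;;) {                      /* loop until SIGKILL */
        ceph_write(cmount, fd, buf, sizeof(buf), off);
        off += sizeof(buf);
      }
    }
    return pid;
  }

  int main(void)
  {
    pid_t a = spawn_writer("/recover_me");
    sleep(2);
    pid_t b = spawn_writer("/recover_me");
    sleep(2);

    kill(a, SIGKILL);  waitpid(a, NULL, 0);   /* first writer "crashes" */
    sleep(1);
    kill(b, SIGKILL);  waitpid(b, NULL, 0);   /* second writer "crashes" */

    /* parent: stat the file, forcing the MDS to recover size/mtime
       from the dead clients' caps */
    struct ceph_mount_info *cmount;
    ceph_create(&cmount, NULL);
    ceph_conf_read_file(cmount, NULL);
    ceph_mount(cmount, "/");
    struct stat st;
    int r = ceph_stat(cmount, "/recover_me", &st);
    printf("stat = %d, size = %lld\n", r, (long long)st.st_size);
    ceph_shutdown(cmount);
    return r;
  }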

sage



> 
> Thanks,
> Yan, Zheng
> 
> > 
> > On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> >> From: "Yan, Zheng" <zheng.z.yan@intel.com>
> >>
> >> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> >> ---
> >>  src/mds/MDCache.cc | 6 ++++--
> >>  1 file changed, 4 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> >> index 973a4d0..e9a79cd 100644
> >> --- a/src/mds/MDCache.cc
> >> +++ b/src/mds/MDCache.cc
> >> @@ -5502,8 +5502,10 @@ void MDCache::_queue_file_recover(CInode *in)
> >>    dout(15) << "_queue_file_recover " << *in << dendl;
> >>    assert(in->is_auth());
> >>    in->state_clear(CInode::STATE_NEEDSRECOVER);
> >> -  in->state_set(CInode::STATE_RECOVERING);
> >> -  in->auth_pin(this);
> >> +  if (!in->state_test(CInode::STATE_RECOVERING)) {
> >> +    in->state_set(CInode::STATE_RECOVERING);
> >> +    in->auth_pin(this);
> >> +  }
> >>    file_recover_queue.insert(in);
> >>  }
> >>
> >> --
> >> 1.7.11.7
> >>
> 
> 
> 

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 21/39] mds: encode dirfrag base in cache rejoin ack
  2013-03-20 23:33   ` Gregory Farnum
  2013-03-20 23:40     ` Gregory Farnum
@ 2013-03-21  6:41     ` Yan, Zheng
  2013-03-21 21:58       ` Gregory Farnum
  1 sibling, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-21  6:41 UTC (permalink / raw)
  To: Gregory Farnum; +Cc: ceph-devel, Sage Weil

On 03/21/2013 07:33 AM, Gregory Farnum wrote:
> This needs to handle versioning the encoding based on peer feature bits too.
> 
> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>
>> Cache rejoin ack message already encodes inode base, make it also encode
>> dirfrag base. This allowes the message to replicate stray dentries like
>> MDentryUnlink message. The function will be used by later patch.
>>
>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>> ---
>>  src/mds/CDir.h                 | 20 +++++++++++++-------
>>  src/mds/MDCache.cc             | 20 ++++++++++++++++++--
>>  src/messages/MMDSCacheRejoin.h | 12 +++++++++++-
>>  3 files changed, 42 insertions(+), 10 deletions(-)
>>
>> diff --git a/src/mds/CDir.h b/src/mds/CDir.h
>> index 79946f1..f4a3a3d 100644
>> --- a/src/mds/CDir.h
>> +++ b/src/mds/CDir.h
>> @@ -437,23 +437,29 @@ private:
>>      ::encode(dist, bl);
>>    }
>>
>> -  void encode_replica(int who, bufferlist& bl) {
>> -    __u32 nonce = add_replica(who);
>> -    ::encode(nonce, bl);
>> +  void _encode_base(bufferlist& bl) {
>>      ::encode(first, bl);
>>      ::encode(fnode, bl);
>>      ::encode(dir_rep, bl);
>>      ::encode(dir_rep_by, bl);
>>    }
>> -  void decode_replica(bufferlist::iterator& p) {
>> -    __u32 nonce;
>> -    ::decode(nonce, p);
>> -    replica_nonce = nonce;
>> +  void _decode_base(bufferlist::iterator& p) {
>>      ::decode(first, p);
>>      ::decode(fnode, p);
>>      ::decode(dir_rep, p);
>>      ::decode(dir_rep_by, p);
>>    }
>> +  void encode_replica(int who, bufferlist& bl) {
>> +    __u32 nonce = add_replica(who);
>> +    ::encode(nonce, bl);
>> +    _encode_base(bl);
>> +  }
>> +  void decode_replica(bufferlist::iterator& p) {
>> +    __u32 nonce;
>> +    ::decode(nonce, p);
>> +    replica_nonce = nonce;
>> +    _decode_base(p);
>> +  }
>>
>>
>>
>> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>> index 8ba676e..344777e 100644
>> --- a/src/mds/MDCache.cc
>> +++ b/src/mds/MDCache.cc
>> @@ -4510,8 +4510,22 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>>      }
>>    }
>>
>> +  // full dirfrags
>> +  bufferlist::iterator p = ack->dirfrag_base.begin();
>> +  while (!p.end()) {
>> +    dirfrag_t df;
>> +    bufferlist basebl;
>> +    ::decode(df, p);
>> +    ::decode(basebl, p);
>> +    CDir *dir = get_dirfrag(df);
>> +    assert(dir);
>> +    bufferlist::iterator q = basebl.begin();
>> +    dir->_decode_base(q);
>> +    dout(10) << " got dir replica " << *dir << dendl;
>> +  }
>> +
>>    // full inodes
>> -  bufferlist::iterator p = ack->inode_base.begin();
>> +  p = ack->inode_base.begin();
>>    while (!p.end()) {
>>      inodeno_t ino;
>>      snapid_t last;
>> @@ -5178,8 +5192,10 @@ void MDCache::rejoin_send_acks()
>>        // dir
>>        for (map<int,int>::iterator r = dir->replicas_begin();
>>            r != dir->replicas_end();
>> -          ++r)
>> +          ++r) {
>>         ack[r->first]->add_strong_dirfrag(dir->dirfrag(), ++r->second, dir->dir_rep);
>> +       ack[r->first]->add_dirfrag_base(dir);
>> +      }
>>
>>        for (CDir::map_t::iterator q = dir->items.begin();
>>            q != dir->items.end();
>> diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h
>> index b88f551..7c37ab4 100644
>> --- a/src/messages/MMDSCacheRejoin.h
>> +++ b/src/messages/MMDSCacheRejoin.h
>> @@ -20,6 +20,7 @@
>>  #include "include/types.h"
>>
>>  #include "mds/CInode.h"
>> +#include "mds/CDir.h"
>>
>>  // sent from replica to auth
>>
>> @@ -169,6 +170,7 @@ class MMDSCacheRejoin : public Message {
>>    // full
>>    bufferlist inode_base;
>>    bufferlist inode_locks;
>> +  bufferlist dirfrag_base;
>>
>>    // authpins, xlocks
>>    struct slave_reqid {
>> @@ -258,7 +260,13 @@ public:
>>    void add_strong_dirfrag(dirfrag_t df, int n, int dr) {
>>      strong_dirfrags[df] = dirfrag_strong(n, dr);
>>    }
>> -
>> +  void add_dirfrag_base(CDir *dir) {
>> +    ::encode(dir->dirfrag(), dirfrag_base);
>> +    bufferlist bl;
>> +    dir->_encode_base(bl);
>> +    ::encode(bl, dirfrag_base);
>> +  }
> 
> We are guilty of doing this in other places, but we should avoid
> implicit encodings like this one, especially when the decode happens
> somewhere else like it does here. We can make a vector dirfrag_bases
> and add to that, and then encode and decode it along with the rest of
> the message — would that work for your purposes?
> -Greg
> 

Should I update this patch, or send a new patch that updates both {inode,dirfrag}_base?

Thanks
Yan, Zheng
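
For illustration, a minimal sketch of the explicit-container approach suggested
above might look like the following. It assumes nothing beyond what the quoted
diff already uses (dirfrag_t, bufferlist and the generic ::encode/::decode
helpers); the field name is only a placeholder:

  // keep the per-dirfrag bases in an explicit container, so that the encode
  // and the decode of the field live together with the rest of the message
  map<dirfrag_t, bufferlist> dirfrag_bases;

  void add_dirfrag_base(CDir *dir) {
    bufferlist& bl = dirfrag_bases[dir->dirfrag()];
    dir->_encode_base(bl);
  }

  // in the message's encode/decode routines:
  //   ::encode(dirfrag_bases, payload);
  //   ::decode(dirfrag_bases, p);
  // and the receiver can then iterate dirfrag_bases instead of walking an
  // opaque bufferlist.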

>> +
>>    // dentries
>>    void add_weak_dirfrag(dirfrag_t df) {
>>      weak_dirfrags.insert(df);
>> @@ -294,6 +302,7 @@ public:
>>      ::encode(wrlocked_inodes, payload);
>>      ::encode(cap_export_bl, payload);
>>      ::encode(strong_dirfrags, payload);
>> +    ::encode(dirfrag_base, payload);
>>      ::encode(weak, payload);
>>      ::encode(weak_dirfrags, payload);
>>      ::encode(weak_inodes, payload);
>> @@ -319,6 +328,7 @@ public:
>>        ::decode(cap_export_paths, q);
>>      }
>>      ::decode(strong_dirfrags, p);
>> +    ::decode(dirfrag_base, p);
>>      ::decode(weak, p);
>>      ::decode(weak_dirfrags, p);
>>      ::decode(weak_inodes, p);
>> --
>> 1.7.11.7
>>


^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 04/39] mds: make sure table request id unique
  2013-03-20 18:31           ` Greg Farnum
@ 2013-03-21  8:07             ` Yan, Zheng
  2013-03-21 22:03               ` Gregory Farnum
  0 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-21  8:07 UTC (permalink / raw)
  To: Greg Farnum; +Cc: Sage Weil, ceph-devel

On 03/21/2013 02:31 AM, Greg Farnum wrote:
> On Tuesday, March 19, 2013 at 11:49 PM, Yan, Zheng wrote:
>> On 03/20/2013 02:15 PM, Sage Weil wrote:
>>> On Wed, 20 Mar 2013, Yan, Zheng wrote:
>>>> On 03/20/2013 07:09 AM, Greg Farnum wrote:
>>>>> Hmm, this is definitely narrowing the race (probably enough to never hit it), but it's not actually eliminating it (if the restart happens after 4 billion requests?). More importantly this kind of symptom makes me worry that we might be papering over more serious issues with colliding states in the Table on restart.
>>>>> I don't have the MDSTable semantics in my head so I'll need to look into this later unless somebody else volunteers to do so?
>>>>  
>>>>  
>>>>  
>>>> Not just 4 billion requests: an MDS restart has several stages, and the
>>>> mdsmap epoch increases for each stage. I don't think there are any more
>>>> colliding states in the table. The table client/server use two-phase commit;
>>>> it's similar to a client request that involves multiple MDS. The reqid is
>>>> analogous to the client request ID. The difference is that a client request
>>>> ID is unique because a new client always gets a unique session ID.
>>>  
>>>  
>>>  
>>> Each time a tid is consumed (at least for an update) it is journaled in  
>>> the EMetaBlob::table_tids list, right? So we could actually take a max  
>>> from journal replay and pick up where we left off? That seems like the  
>>> cleanest.
>>>  
>>> I'm not too worried about 2^32 tids, I guess, but it would be nicer to  
>>> avoid that possibility.
>>  
>>  
>>  
>> Can we re-use the client request ID as the table client request ID?
>>  
>> Regards
>> Yan, Zheng
> 
> Not sure what you're referring to here — do you mean the ID of the filesystem client request which prompted the update? I don't think that would work as client requests actually require two parts to be unique (the client GUID and the request seq number), and I'm pretty sure a single client request can spawn multiple Table updates.
> 

You are right, client request ID does not work.

> As I look over this more, it sure looks to me as if the effect of the code we have (when non-broken) is to rollback every non-committed request by an MDS which restarted — the only time it can handle the TableServer's "agree" with a different response is if the MDS was incorrectly marked out by the map. Am I parsing this correctly, Sage? Given that, and without having looked at the code more broadly, I think we want to add some sort of implicit or explicit handshake letting each of them know if the MDS actually disappeared. We use the process/address nonce to accomplish this in other places…
> -Greg
> 

The table server sends an 'agree' message to the table client after a 'prepare' entry is safely logged. The table server re-sends the 'agree' message in two cases: when the table client restarts, and when the table server itself restarts.
The purpose of re-sending the 'agree' message is to check whether the table client still wants to keep the update preparation (the table client might have crashed before submitting the update). The purpose of the reqid is to associate the
table update preparation request with the server's 'agree' reply message. The problem here is that the table client does not ensure the reqid is unique across restarts. If you feel 2^32 reqids are still a concern, setting the reqid to a
randomized 64-bit value should be safe enough.

Thanks
Yan, Zheng
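
To make the two options discussed above concrete, here is a small, self-contained
sketch. The names are illustrative only; MDSTableClient is not actually structured
this way:

  #include <algorithm>
  #include <cstdint>
  #include <random>

  struct reqid_source {
    uint64_t last_reqid;

    reqid_source() : last_reqid(0) {}

    // option (a): during journal replay, remember the largest tid seen
    // (e.g. while replaying EMetaBlob::table_tids) and continue from there
    void note_replayed_tid(uint64_t tid) {
      last_reqid = std::max(last_reqid, tid);
    }

    // option (b): seed with a random 64-bit value, so reqids issued by
    // different incarnations of the MDS are unlikely to collide
    void seed_random() {
      last_reqid = std::mt19937_64(std::random_device{}())();
    }

    uint64_t next_reqid() { return ++last_reqid; }
  };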

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 18/39] mds: fix MDS recovery involving cross authority rename
  2013-03-17 14:51 ` [PATCH 18/39] mds: fix MDS recovery involving cross authority rename Yan, Zheng
@ 2013-03-21 17:59   ` Gregory Farnum
  2013-03-22  3:04     ` Yan, Zheng
  0 siblings, 1 reply; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21 17:59 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Sage Weil, ceph-devel

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> In an MDS cluster, a rename operation may involve multiple MDS. If the
> rename source's auth MDS crashes after some witness MDS have prepared
> the rename but before the rename is committed, then when that MDS
> recovers, its subtree map and linkages differ from the prepared
> MDS'. This causes problems for both subtree resolve and cache rejoin.
> The solution is: if the rename source's auth MDS fails, the prepared
> witness MDS query the master MDS to ask whether the operation is
> committing. If it is not, they roll back the rename, then send their
> resolve messages to the recovering MDS.
>
> Another similar case is a prepared witness MDS crashing while the
> rename source's auth MDS has prepared or is preparing the operation.
> When the witness recovers, the master just delays sending the resolve
> ack message until it commits the operation.
>
> This patch also updates Server::handle_client_rename(), making preparation
> of the rename source's auth MDS the final step before committing the
> rename.

Why? It's not immediately obvious to me what the benefit is, and the
commit message should state it. :)
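
A toy sketch of the witness-side flow the commit message describes; every name
here is made up for illustration, the real implementation is in the diff below:

  // what a prepared witness does once it knows the rename source's auth MDS
  // failed and has asked the master whether the operation is committing
  enum master_answer { MASTER_COMMITTING, MASTER_ABORTING };

  void witness_resolve_after_srcdn_auth_failure(master_answer a) {
    if (a == MASTER_COMMITTING) {
      // keep the prepared slave update; it will be committed as usual
    } else {
      // roll back the prepared rename first
    }
    // ...then send the subtree resolve to the recovering MDS, so its rebuilt
    // subtree map and linkages agree with what the survivors ended up with
  }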

>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc |  75 +++++++++++++++++++++++++++++-----------
>  src/mds/MDCache.h  |  17 +++++++--
>  src/mds/Mutation.h |   2 ++
>  src/mds/Server.cc  | 100 ++++++++++++++++++++++++++++-------------------------
>  4 files changed, 124 insertions(+), 70 deletions(-)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 9b37b1e..d934020 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -2491,7 +2491,7 @@ void MDCache::send_slave_resolves()
>        if (!p->second->is_slave() || !p->second->slave_did_prepare())
>         continue;
>        int master = p->second->slave_to_mds;
> -      if (resolve_set.count(master)) {
> +      if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
>         dout(10) << " including uncommitted " << *p->second << dendl;
>         if (!resolves.count(master))
>           resolves[master] = new MMDSResolve;
> @@ -2610,6 +2610,7 @@ void MDCache::handle_mds_failure(int who)
>
>    resolve_gather.insert(who);
>    discard_delayed_resolve(who);
> +  ambiguous_slave_updates.erase(who);
>
>    rejoin_gather.insert(who);
>    rejoin_sent.erase(who);        // i need to send another
> @@ -2642,14 +2643,46 @@ void MDCache::handle_mds_failure(int who)
>           finish.push_back(p->second);
>        }
>      }
> +
> +    if (p->second->is_slave() &&
> +       p->second->slave_did_prepare() && p->second->more()->srcdn_auth_mds == who &&
> +       mds->mdsmap->is_clientreplay_or_active_or_stopping(p->second->slave_to_mds)) {
> +      // rename srcdn's auth mds failed, resolve even I'm a survivor.
> +      dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl;
> +      add_ambiguous_slave_update(p->first, p->second->slave_to_mds);
> +    }
>
>      // failed node is slave?
>      if (p->second->is_master() && !p->second->committing) {
> +      if (p->second->more()->srcdn_auth_mds == who) {
> +       dout(10) << " master request " << *p->second << " waiting for rename srcdn's auth mds."
> +                << who << " to recover" << dendl;
> +       assert(p->second->more()->witnessed.count(who) == 0);
> +       if (p->second->more()->is_ambiguous_auth)
> +         p->second->clear_ambiguous_auth();
> +       // rename srcdn's auth mds failed, all witnesses will rollback
> +       p->second->more()->witnessed.clear();
> +       pending_masters.erase(p->first);
> +      }
> +
>        if (p->second->more()->witnessed.count(who)) {
> -       dout(10) << " master request " << *p->second << " no longer witnessed by slave mds." << who
> -                << dendl;
> -       // discard this peer's prepare (if any)
> -       p->second->more()->witnessed.erase(who);
> +       int srcdn_auth = p->second->more()->srcdn_auth_mds;
> +       if (srcdn_auth >= 0 && p->second->more()->waiting_on_slave.count(srcdn_auth)) {
> +         dout(10) << " master request " << *p->second << " waiting for rename srcdn's auth mds."
> +                  << p->second->more()->srcdn_auth_mds << " to reply" << dendl;
> +         // waiting for the last slave (rename srcdn's auth mds), delay sending resolve ack
> +         // until either the request is committing or the last slave also fails.
> +         assert(p->second->more()->waiting_on_slave.size() == 1);
> +         pending_masters.insert(p->first);

The language about "last slave" is confusing me here — I'm with you
that this rename should only have one slave, but I don't think it ever
should have had more than one. Do you mean "only slave" or am I
missing something?

> +       } else {
> +         dout(10) << " master request " << *p->second << " no longer witnessed by slave mds."
> +                  << who << " to recover" << dendl;
> +         if (srcdn_auth >= 0)
> +           assert(p->second->more()->witnessed.count(srcdn_auth) == 0);
> +
> +         // discard this peer's prepare (if any)
> +         p->second->more()->witnessed.erase(who);
> +       }
>        }
>
>        if (p->second->more()->waiting_on_slave.count(who)) {
> @@ -2657,14 +2690,8 @@ void MDCache::handle_mds_failure(int who)
>                  << " to recover" << dendl;
>         // retry request when peer recovers
>         p->second->more()->waiting_on_slave.erase(who);
> -       mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second));
> -      }
> -
> -      if (p->second->has_more() && p->second->more()->is_ambiguous_auth &&
> -         p->second->more()->rename_inode->authority().first == who) {
> -       dout(10) << " master request " << *p->second << " waiting for renamed inode's auth mds." << who
> -                << " to recover" << dendl;
> -       p->second->clear_ambiguous_auth();

Why are you getting rid of waiting for the renamed inode's MDS? I
could be misremembering, but I believe we need it, and it might be
different from the source or dest dentry auths.

> +       if (p->second->more()->waiting_on_slave.empty())
> +         mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second));
>        }
>
>        if (p->second->locking && p->second->locking_target_mds == who)
> @@ -2951,16 +2978,27 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
>    dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
>    int from = ack->get_source().num();
>
> -  if (!resolve_ack_gather.count(from)) {
> +  if (!resolve_ack_gather.count(from) ||
> +      mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
>      ack->put();
>      return;
>    }
>
> +  if (ambiguous_slave_updates.count(from)) {
> +    assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
> +    assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
> +  }
> +
>    for (vector<metareqid_t>::iterator p = ack->commit.begin();
>         p != ack->commit.end();
>         ++p) {
>      dout(10) << " commit on slave " << *p << dendl;
>
> +    if (ambiguous_slave_updates.count(from)) {
> +      remove_ambiguous_slave_update(*p, from);
> +      continue;
> +    }
> +
>      if (mds->is_resolve()) {
>        // replay
>        MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
> @@ -3020,13 +3058,8 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
>      }
>    }
>
> -  if (!mds->is_resolve()) {
> -    for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
> -       p != active_requests.end(); ++p)
> -      assert(p->second->slave_to_mds != from);
> -  }
> -
> -  resolve_ack_gather.erase(from);
> +  if (!ambiguous_slave_updates.count(from))
> +    resolve_ack_gather.erase(from);
>    if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
>      send_subtree_resolves();
>      process_delayed_resolve();
> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
> index 8f262b9..a05ced7 100644
> --- a/src/mds/MDCache.h
> +++ b/src/mds/MDCache.h
> @@ -327,9 +327,8 @@ protected:
>    map<metareqid_t, umaster>                 uncommitted_masters;         // master: req -> slave set
>
>    set<metareqid_t>             pending_masters;
> +  map<int, set<metareqid_t> >  ambiguous_slave_updates;
>
> -  //map<metareqid_t, bool>     ambiguous_slave_updates;         // for log trimming.
> -  //map<metareqid_t, Context*> waiting_for_slave_update_commit;
>    friend class ESlaveUpdate;
>    friend class ECommitted;
>
> @@ -353,6 +352,20 @@ protected:
>  public:
>    void remove_inode_recursive(CInode *in);
>
> +  bool is_ambiguous_slave_update(metareqid_t reqid, int master) {
> +    return ambiguous_slave_updates.count(master) &&
> +          ambiguous_slave_updates[master].count(reqid);
> +  }
> +  void add_ambiguous_slave_update(metareqid_t reqid, int master) {
> +    ambiguous_slave_updates[master].insert(reqid);
> +  }
> +  void remove_ambiguous_slave_update(metareqid_t reqid, int master) {
> +    assert(ambiguous_slave_updates[master].count(reqid));
> +    ambiguous_slave_updates[master].erase(reqid);
> +    if (ambiguous_slave_updates[master].empty())
> +      ambiguous_slave_updates.erase(master);
> +  }
> +
>    void add_rollback(metareqid_t reqid, int master) {
>      need_resolve_rollback[reqid] = master;
>    }
> diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h
> index 5013f04..de122a5 100644
> --- a/src/mds/Mutation.h
> +++ b/src/mds/Mutation.h
> @@ -207,6 +207,7 @@ struct MDRequest : public Mutation {
>
>      // for rename
>      set<int> extra_witnesses; // replica list from srcdn auth (rename)
> +    int srcdn_auth_mds;
>      version_t src_reanchor_atid;  // src->dst
>      version_t dst_reanchor_atid;  // dst->stray
>      bufferlist inode_import;
> @@ -233,6 +234,7 @@ struct MDRequest : public Mutation {
>      bufferlist rollback_bl;
>
>      More() :
> +      srcdn_auth_mds(-1),
>        src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0),
>        rename_inode(0), is_freeze_authpin(false), is_ambiguous_auth(false),
>        is_remote_frozen_authpin(false), is_inode_exporter(false),
> diff --git a/src/mds/Server.cc b/src/mds/Server.cc
> index 1330f11..b6e5665 100644
> --- a/src/mds/Server.cc
> +++ b/src/mds/Server.cc
> @@ -5772,12 +5772,52 @@ void Server::handle_client_rename(MDRequest *mdr)
>    if (mdr->now == utime_t())
>      mdr->now = ceph_clock_now(g_ceph_context);
>
> +  // -- prepare anchor updates --
> +  if (!linkmerge || srcdnl->is_primary()) {
> +    C_GatherBuilder anchorgather(g_ceph_context);
> +
> +    if (srcdnl->is_primary() &&
> +      (srcdnl->get_inode()->is_anchored() ||
> +       (srcdnl->get_inode()->is_dir() && (srcdnl->get_inode()->inode.rstat.ranchors ||
> +                                          srcdnl->get_inode()->nested_anchors ||
> +                                          !mdcache->is_leaf_subtree(mdcache->get_projected_subtree_root(srcdn->get_dir()))))) &&
> +      !mdr->more()->src_reanchor_atid) {
> +      dout(10) << "reanchoring src->dst " << *srcdnl->get_inode() << dendl;
> +      vector<Anchor> trace;
> +      destdn->make_anchor_trace(trace, srcdnl->get_inode());
> +      mds->anchorclient->prepare_update(srcdnl->get_inode()->ino(),
> +                                       trace, &mdr->more()->src_reanchor_atid,
> +                                       anchorgather.new_sub());
> +    }
> +    if (destdnl->is_primary() &&
> +       destdnl->get_inode()->is_anchored() &&
> +       !mdr->more()->dst_reanchor_atid) {
> +      dout(10) << "reanchoring dst->stray " << *destdnl->get_inode() << dendl;
> +
> +      assert(straydn);
> +      vector<Anchor> trace;
> +      straydn->make_anchor_trace(trace, destdnl->get_inode());
> +
> +      mds->anchorclient->prepare_update(destdnl->get_inode()->ino(), trace,
> +                 &mdr->more()->dst_reanchor_atid, anchorgather.new_sub());
> +    }
> +
> +    if (anchorgather.has_subs())  {
> +      anchorgather.set_finisher(new C_MDS_RetryRequest(mdcache, mdr));
> +      anchorgather.activate();
> +      return;  // waiting for anchor prepares
> +    }
> +
> +    assert(g_conf->mds_kill_rename_at != 2);
> +  }
> +
>    // -- prepare witnesses --
>
>    // do srcdn auth last
>    int last = -1;
>    if (!srcdn->is_auth()) {
>      last = srcdn->authority().first;
> +    mdr->more()->srcdn_auth_mds = last;
>      // ask auth of srci to mark srci as ambiguous auth if more than two MDS
>      // are involved in the rename operation.
>      if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
> @@ -5803,58 +5843,18 @@ void Server::handle_client_rename(MDRequest *mdr)
>    if (!mdr->more()->waiting_on_slave.empty())
>      return;  // we're waiting for a witness.
>
> -  if (last >= 0 &&
> -      mdr->more()->witnessed.count(last) == 0 &&
> -      mdr->more()->waiting_on_slave.count(last) == 0) {
> +  if (last >= 0 && mdr->more()->witnessed.count(last) == 0) {
>      dout(10) << " preparing last witness (srcdn auth)" << dendl;
> +    assert(mdr->more()->waiting_on_slave.count(last) == 0);
>      _rename_prepare_witness(mdr, last, witnesses, srcdn, destdn, straydn);
>      return;
>    }
>
>    // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
>    if (!mdr->more()->slaves.empty() && !srci->is_dir())
> -    assert(g_conf->mds_kill_rename_at != 2);
> +    assert(g_conf->mds_kill_rename_at != 3);
>    if (!mdr->more()->slaves.empty() && srci->is_dir())
> -    assert(g_conf->mds_kill_rename_at != 3);
> -
> -  // -- prepare anchor updates --
> -  if (!linkmerge || srcdnl->is_primary()) {
> -    C_GatherBuilder anchorgather(g_ceph_context);
> -
> -    if (srcdnl->is_primary() &&
> -       (srcdnl->get_inode()->is_anchored() ||
> -        (srcdnl->get_inode()->is_dir() && (srcdnl->get_inode()->inode.rstat.ranchors ||
> -                                           srcdnl->get_inode()->nested_anchors ||
> -                                           !mdcache->is_leaf_subtree(mdcache->get_projected_subtree_root(srcdn->get_dir()))))) &&
> -       !mdr->more()->src_reanchor_atid) {
> -      dout(10) << "reanchoring src->dst " << *srcdnl->get_inode() << dendl;
> -      vector<Anchor> trace;
> -      destdn->make_anchor_trace(trace, srcdnl->get_inode());
> -      mds->anchorclient->prepare_update(srcdnl->get_inode()->ino(),
> -                                       trace, &mdr->more()->src_reanchor_atid,
> -                                       anchorgather.new_sub());
> -    }
> -    if (destdnl->is_primary() &&
> -       destdnl->get_inode()->is_anchored() &&
> -       !mdr->more()->dst_reanchor_atid) {
> -      dout(10) << "reanchoring dst->stray " << *destdnl->get_inode() << dendl;
> -
> -      assert(straydn);
> -      vector<Anchor> trace;
> -      straydn->make_anchor_trace(trace, destdnl->get_inode());
> -
> -      mds->anchorclient->prepare_update(destdnl->get_inode()->ino(), trace,
> -                 &mdr->more()->dst_reanchor_atid, anchorgather.new_sub());
> -    }
> -
> -    if (anchorgather.has_subs())  {
> -      anchorgather.set_finisher(new C_MDS_RetryRequest(mdcache, mdr));
> -      anchorgather.activate();
> -      return;  // waiting for anchor prepares
> -    }
> -
>      assert(g_conf->mds_kill_rename_at != 4);
> -  }
>
>    // -- prepare journal entry --
>    mdr->ls = mdlog->get_current_segment();
> @@ -6762,10 +6762,17 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r,
>      // abort
>      //  rollback_bl may be empty if we froze the inode but had to provide an expanded
>      // witness list from the master, and they failed before we tried prep again.
> -    if (mdr->more()->rollback_bl.length())
> -      do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
> -    else
> +    if (mdr->more()->rollback_bl.length()) {
> +      if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
> +       mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
> +       // rollback but preserve the slave request
> +       do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, NULL);
> +      } else
> +       do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
> +    } else {
>        dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
> +      mds->mdcache->request_finish(mdr);
> +    }
>    }
>  }
>
> @@ -6825,7 +6832,6 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr)
>    dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
>    // need to finish this update before sending resolve to claim the subtree
>    mds->mdcache->add_rollback(rollback.reqid, master);
> -  assert(mdr || mds->is_resolve());
>
>    Mutation *mut = new Mutation(rollback.reqid);
>    mut->ls = mds->mdlog->get_current_segment();
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 22/39] mds: handle linkage mismatch during cache rejoin
  2013-03-17 14:51 ` [PATCH 22/39] mds: handle linkage mismatch during cache rejoin Yan, Zheng
@ 2013-03-21 21:23   ` Gregory Farnum
  2013-03-22  3:05     ` Yan, Zheng
  2013-03-26  7:21     ` Yan, Zheng
  0 siblings, 2 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21 21:23 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> In an MDS cluster, not all file system namespace operations that impact
> multiple MDS use two-phase commit. Some operations use dentry link/unlink
> messages to update the replica dentry's linkage after they are committed by
> the master MDS. It's possible for the master MDS to crash after journaling an
> operation, but before sending the dentry link/unlink messages. Later, when
> the MDS recovers and receives cache rejoin messages from the surviving
> MDS, it will find linkage mismatches.

I think you're here talking about link/unlink, and the MDS crashing
after it's sent out the LogEvent to the OSD but it hasn't actually
dispatched the observer slave requests. Is that right? This commit
message really confused me; I was trying to figure out which namespace
operations were hacking around a proper 2-phase commit by unlinking
and relinking inodes into the tree! (The link/unlink code also is
doing a 2-phase commit, it just doesn't force a particular order for
the journaling, which was previously left unhandled).

>
> The original cache rejoin code does not properly handle the case where
> dentry unlink messages were missed. Unlinked inodes were linked to stray
> dentries. So the cache rejoin ack message needs to push replicas of these
> stray dentries to the surviving MDS.
>
> This patch also adds code that handles cache expiration in the middle of
> cache rejoining.
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc | 348 +++++++++++++++++++++++++++++++++++------------------
>  src/mds/MDCache.h  |   1 +
>  2 files changed, 233 insertions(+), 116 deletions(-)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 344777e..38b1fdf 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -3536,7 +3536,6 @@ void MDCache::rejoin_send_rejoins()
>      } else {
>        // strong
>        if (p->first == 0 && root) {
> -       p->second->add_weak_inode(root->vino());
>         p->second->add_strong_inode(root->vino(),
>                                     root->get_replica_nonce(),
>                                     root->get_caps_wanted(),
> @@ -3550,7 +3549,6 @@ void MDCache::rejoin_send_rejoins()
>        }
>
>        if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
> -       p->second->add_weak_inode(in->vino());
>         p->second->add_strong_inode(in->vino(),
>                                     in->get_replica_nonce(),
>                                     in->get_caps_wanted(),
> @@ -3567,6 +3565,8 @@ void MDCache::rejoin_send_rejoins()
>      for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
>          p != active_requests.end();
>          ++p) {
> +      if ( p->second->is_slave())
> +       continue;
>        // auth pins
>        for (set<MDSCacheObject*>::iterator q = p->second->remote_auth_pins.begin();
>            q != p->second->remote_auth_pins.end();
> @@ -4226,6 +4226,8 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>      rejoin_potential_updated_scatterlocks.insert(in);
>    }
>
> +  rejoin_unlinked_inodes[from].clear();
> +
>    // surviving peer may send incorrect dirfrag here (maybe they didn't
>    // get the fragment notify, or maybe we rolled back?).  we need to
>    // infer the right frag and get them with the program.  somehow.
> @@ -4332,105 +4334,125 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>
>        dn->add_replica(from, q->second.nonce);
>        dout(10) << " have " << *dn << dendl;
> -
> -      // inode?
> -      if (dnl->is_primary()) {
> -       CInode *in = dnl->get_inode();
> -       assert(in);
> -
> -       if (strong->strong_inodes.count(in->vino())) {
> -         MMDSCacheRejoin::inode_strong &is = strong->strong_inodes[in->vino()];
>
> -         // caps_wanted
> -         if (is.caps_wanted) {
> -           in->mds_caps_wanted[from] = is.caps_wanted;
> -           dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
> -                    << " on " << *in << dendl;
> -         }
> -
> -         // scatterlocks?
> -         //  infer state from replica state:
> -         //   * go to MIX if they might have wrlocks
> -         //   * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
> -         in->filelock.infer_state_from_strong_rejoin(is.filelock, true);  // maybe also go to LOCK
> -         in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
> -         in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
> -
> -         // auth pin?
> -         if (strong->authpinned_inodes.count(in->vino())) {
> -           MMDSCacheRejoin::slave_reqid r = strong->authpinned_inodes[in->vino()];
> -           dout(10) << " inode authpin by " << r << " on " << *in << dendl;
> -
> -           // get/create slave mdrequest
> -           MDRequest *mdr;
> -           if (have_request(r.reqid))
> -             mdr = request_get(r.reqid);
> -           else
> -             mdr = request_start_slave(r.reqid, r.attempt, from);
> -           if (strong->frozen_authpin_inodes.count(in->vino())) {
> -             assert(!in->get_num_auth_pins());
> -             mdr->freeze_auth_pin(in);
> -           } else {
> -             assert(!in->is_frozen_auth_pin());
> -           }
> -           mdr->auth_pin(in);
> -         }
> -         // xlock(s)?
> -         if (strong->xlocked_inodes.count(in->vino())) {
> -           for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->xlocked_inodes[in->vino()].begin();
> -                r != strong->xlocked_inodes[in->vino()].end();
> -                ++r) {
> -             SimpleLock *lock = in->get_lock(r->first);
> -             dout(10) << " inode xlock by " << r->second << " on " << *lock << " on " << *in << dendl;
> -             MDRequest *mdr = request_get(r->second.reqid);  // should have this from auth_pin above.
> -             assert(mdr->is_auth_pinned(in));
> -             if (lock->is_stable())
> -               in->auth_pin(lock);
> -             lock->set_state(LOCK_XLOCK);
> -             if (lock == &in->filelock)
> -               in->loner_cap = -1;
> -             lock->get_xlock(mdr, mdr->get_client());
> -             mdr->xlocks.insert(lock);
> -             mdr->locks.insert(lock);
> -           }
> -         }
> -         // wrlock(s)?
> -         if (strong->wrlocked_inodes.count(in->vino())) {
> -           for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->wrlocked_inodes[in->vino()].begin();
> -                r != strong->wrlocked_inodes[in->vino()].end();
> -                ++r) {
> -             SimpleLock *lock = in->get_lock(r->first);
> -             dout(10) << " inode wrlock by " << r->second << " on " << *lock << " on " << *in << dendl;
> -             MDRequest *mdr = request_get(r->second.reqid);  // should have this from auth_pin above.
> -             assert(mdr->is_auth_pinned(in));
> -             lock->set_state(LOCK_LOCK);
> -             if (lock == &in->filelock)
> -               in->loner_cap = -1;
> -             lock->get_wrlock(true);
> -             mdr->wrlocks.insert(lock);
> -             mdr->locks.insert(lock);
> -           }
> +      if (dnl->is_primary()) {
> +       if (q->second.is_primary()) {
> +         if (!(vinodeno_t(q->second.ino, q->first.snapid) == dnl->get_inode()->vino())) {

Maybe it's worth adding an operator!= for vinodeno_t, since you seem
to use this a couple times.
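
For what it's worth, such an operator can be written entirely in terms of the
existing equality; a one-line sketch, assuming operator== is already declared
for vinodeno_t (the check above relies on it):

  inline bool operator!=(const vinodeno_t& l, const vinodeno_t& r) {
    return !(l == r);
  }

  // which would let the test above read:
  //   if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino()) ...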

> +           // the survivor missed MDentryUnlink+MDentryLink messages ?
> +           assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
> +           CInode *in = get_inode(q->second.ino, q->first.snapid);
> +           assert(in);
> +           rejoin_unlinked_inodes[from].insert(in);
> +           dout(7) << " sender has primary dentry but wrong inode" << dendl;
>           }
>         } else {
> -         dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl;
> +         // the survivor missed MDentryLink message ?
> +         assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
> +         dout(7) << " sender doesn't primay dentry" << dendl;

doesn't have primary? or something else?

> +       }
> +      } else {
> +       if (q->second.is_primary()) {
> +         // the survivor missed MDentryUnlink message ?
> +         CInode *in = get_inode(q->second.ino, q->first.snapid);
> +         assert(in);
> +         rejoin_unlinked_inodes[from].insert(in);
> +         dout(7) << " sender has primary dentry but we don't" << dendl;
>         }
> -
> -       in->add_replica(from, p->second.nonce);
> -       dout(10) << " have " << *in << dendl;
>        }
>      }
>    }
>
> -  // base inodes?  (root, stray, etc.)
> -  for (set<vinodeno_t>::iterator p = strong->weak_inodes.begin();
> -       p != strong->weak_inodes.end();
> +  for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
> +       p != strong->strong_inodes.end();
>         ++p) {
> -    CInode *in = get_inode(*p);
> -    dout(10) << " have base " << *in << dendl;
> -    in->add_replica(from);
> +    CInode *in = get_inode(p->first);
> +    assert(in);
> +    in->add_replica(from, p->second.nonce);
> +    dout(10) << " have " << *in << dendl;
> +
> +    MMDSCacheRejoin::inode_strong &is = p->second;
> +
> +    // caps_wanted
> +    if (is.caps_wanted) {
> +      in->mds_caps_wanted[from] = is.caps_wanted;
> +      dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
> +              << " on " << *in << dendl;
> +    }
> +
> +    // scatterlocks?
> +    //  infer state from replica state:
> +    //   * go to MIX if they might have wrlocks
> +    //   * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
> +    in->filelock.infer_state_from_strong_rejoin(is.filelock, true);  // maybe also go to LOCK
> +    in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
> +    in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
> +
> +    // auth pin?
> +    if (strong->authpinned_inodes.count(in->vino())) {
> +      MMDSCacheRejoin::slave_reqid r = strong->authpinned_inodes[in->vino()];
> +      dout(10) << " inode authpin by " << r << " on " << *in << dendl;
> +
> +      // get/create slave mdrequest
> +      MDRequest *mdr;
> +      if (have_request(r.reqid))
> +       mdr = request_get(r.reqid);
> +      else
> +       mdr = request_start_slave(r.reqid, r.attempt, from);
> +      if (strong->frozen_authpin_inodes.count(in->vino())) {
> +       assert(!in->get_num_auth_pins());
> +       mdr->freeze_auth_pin(in);
> +      } else {
> +       assert(!in->is_frozen_auth_pin());
> +      }
> +      mdr->auth_pin(in);
> +    }
> +    // xlock(s)?
> +    if (strong->xlocked_inodes.count(in->vino())) {
> +      for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
> +          q != strong->xlocked_inodes[in->vino()].end();
> +          ++q) {
> +       SimpleLock *lock = in->get_lock(q->first);
> +       dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
> +       MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
> +       assert(mdr->is_auth_pinned(in));
> +       if (lock->is_stable())
> +         in->auth_pin(lock);
> +       lock->set_state(LOCK_XLOCK);
> +       if (lock == &in->filelock)
> +         in->loner_cap = -1;
> +       lock->get_xlock(mdr, mdr->get_client());
> +       mdr->xlocks.insert(lock);
> +       mdr->locks.insert(lock);
> +      }
> +    }
> +    // wrlock(s)?
> +    if (strong->wrlocked_inodes.count(in->vino())) {
> +      for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->wrlocked_inodes[in->vino()].begin();
> +          q != strong->wrlocked_inodes[in->vino()].end();
> +          ++q) {
> +       SimpleLock *lock = in->get_lock(q->first);
> +       dout(10) << " inode wrlock by " << q->second << " on " << *lock << " on " << *in << dendl;
> +       MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
> +       assert(mdr->is_auth_pinned(in));
> +       lock->set_state(LOCK_LOCK);
> +       if (lock == &in->filelock)
> +         in->loner_cap = -1;
> +       lock->get_wrlock(true);
> +       mdr->wrlocks.insert(lock);
> +       mdr->locks.insert(lock);
> +      }
> +    }
>    }
>
> -
> +  // unlinked inodes should be in stray
> +  for (set<CInode*>::iterator p = rejoin_unlinked_inodes[from].begin();
> +       p != rejoin_unlinked_inodes[from].end();
> +       ++p) {
> +    CInode *in = *p;
> +    dout(7) << " unlinked inode " << *in << dendl;
> +    assert(in->get_parent_dn());
> +    assert(in->is_replica(from));
> +  }

I'm not clear on why we need to check this here — the previous for
loop wasn't adding any inodes to the cache, so shouldn't we just check
these conditions as we add them?

>
>    // done?
>    assert(rejoin_gather.count(from));
> @@ -4448,6 +4470,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>    dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
>    int from = ack->get_source().num();
>
> +  // for sending cache expire message
> +  list<CInode*> isolated_inodes;
> +
>    // dirs
>    for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
>         p != ack->strong_dirfrags.end();
> @@ -4455,7 +4480,29 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>      // we may have had incorrect dir fragmentation; refragment based
>      // on what they auth tells us.
>      CDir *dir = get_force_dirfrag(p->first);
> -    assert(dir);
> +    if (!dir) {
> +      CInode *diri = get_inode(p->first.ino);
> +      if (!diri) {
> +       // barebones inode; the full inode loop below will clean up.
> +       diri = new CInode(this, false);
> +       diri->inode.ino = p->first.ino;
> +       diri->inode.mode = S_IFDIR;
> +       if (MDS_INO_MDSDIR(p->first.ino)) {
> +         diri->inode_auth = pair<int,int>(from, CDIR_AUTH_UNKNOWN);
> +         add_inode(diri);
> +         dout(10) << " add inode " << *diri << dendl;
> +       } else {
> +         diri->inode_auth = CDIR_AUTH_UNDEF;
> +         isolated_inodes.push_back(diri);
> +         dout(10) << " unconnected dirfrag " << p->first << dendl;
> +       }
> +      }
> +      // barebones dirfrag; the full dirfrag loop below will clean up.
> +      dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
> +      if (dir->authority().first != from)
> +       adjust_subtree_auth(dir, from);
> +      dout(10) << " add dirfrag " << *dir << dendl;
> +    }
>
>      dir->set_replica_nonce(p->second.nonce);
>      dir->state_clear(CDir::STATE_REJOINING);
> @@ -4467,7 +4514,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>          q != dmap.end();
>          ++q) {
>        CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
> -      assert(dn);
> +      if(!dn)
> +       dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
> +
>        CDentry::linkage_t *dnl = dn->get_linkage();
>
>        assert(dn->last == q->first.snapid);
> @@ -4476,33 +4525,48 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>         dn->first = q->second.first;
>        }
>
> +      // may have bad linkage if we missed dentry link/unlink messages
> +      if (dnl->is_primary()) {
> +       CInode *in = dnl->get_inode();
> +       if (!q->second.is_primary() ||
> +           !(vinodeno_t(q->second.ino, q->first.snapid) == in->vino())) {
> +         dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
> +         dir->unlink_inode(dn);
> +       }
> +      } else if (dnl->is_remote()) {
> +       if (!q->second.is_remote() ||
> +           q->second.remote_ino != dnl->get_remote_ino() ||
> +           q->second.remote_d_type != dnl->get_remote_d_type()) {
> +         dout(10) << " had bad linkage for " << *dn <<  dendl;
> +         dir->unlink_inode(dn);
> +       }
> +      } else {
> +       if (!q->second.is_null())
> +         dout(10) << " had bad linkage for " << *dn <<  dendl;
> +      }
> +
>        // hmm, did we have the proper linkage here?
> -      if (dnl->is_null() &&
> -         !q->second.is_null()) {
> -       dout(10) << " had bad (missing) linkage for " << *dn << dendl;
> +      if (dnl->is_null() && !q->second.is_null()) {
>         if (q->second.is_remote()) {
>           dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
>         } else {
>           CInode *in = get_inode(q->second.ino, q->first.snapid);
> -         assert(in == 0);  // a rename would have been caught be the resolve stage.
> -         // barebones inode; the full inode loop below will clean up.
> -         in = new CInode(this, false, q->second.first, q->first.snapid);
> -         in->inode.ino = q->second.ino;
> -         add_inode(in);
> +         if (!in) {
> +           // barebones inode; assume it's dir, the full inode loop below will clean up.
> +           in = new CInode(this, false, q->second.first, q->first.snapid);
> +           in->inode.ino = q->second.ino;
> +           in->inode.mode = S_IFDIR;
> +           add_inode(in);
> +           dout(10) << " add inode " << *in << dendl;
> +         } else if (in->get_parent_dn()) {
> +           dout(10) << " had bad linkage for " << *(in->get_parent_dn())
> +                    << ", unlinking " << *in << dendl;
> +           in->get_parent_dir()->unlink_inode(in->get_parent_dn());
> +         }
>           dn->dir->link_primary_inode(dn, in);
>         }
>        }
> -      else if (!dnl->is_null() &&
> -              q->second.is_null()) {
> -       dout(0) << " had bad linkage for " << *dn << dendl;
> -       /*
> -        * this should happen:
> -        *  if we're a survivor, any unlink should commit or rollback during
> -        * the resolve stage.
> -        *  if we failed, we shouldn't have non-auth leaf dentries at all
> -        */
> -       assert(0);  // uh oh.
> -      }
> +
>        dn->set_replica_nonce(q->second.nonce);
>        dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters);
>        dn->state_clear(CDentry::STATE_REJOINING);
> @@ -4564,6 +4628,21 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>      dout(10) << " got inode locks " << *in << dendl;
>    }
>
> +  // trim unconnected subtree
> +  if (!isolated_inodes.empty()) {
> +    map<int, MCacheExpire*> expiremap;
> +    for (list<CInode*>::iterator p = isolated_inodes.begin();
> +        p != isolated_inodes.end();
> +        ++p) {
> +      list<CDir*> ls;
> +      (*p)->get_dirfrags(ls);
> +      trim_dirfrag(*ls.begin(), 0, expiremap);
> +      assert((*p)->get_num_ref() == 0);
> +      delete *p;
> +    }
> +    send_expire_messages(expiremap);
> +  }
> +
>    // done?
>    assert(rejoin_ack_gather.count(from));
>    rejoin_ack_gather.erase(from);
> @@ -5164,6 +5243,37 @@ void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snap
>  void MDCache::rejoin_send_acks()
>  {
>    dout(7) << "rejoin_send_acks" << dendl;
> +
> +  // replicate stray
> +  for (map<int, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
> +       p != rejoin_unlinked_inodes.end();
> +       ++p) {
> +    for (set<CInode*>::iterator q = p->second.begin();
> +        q != p->second.end();
> +        ++q) {
> +      CInode *in = *q;
> +      dout(7) << " unlinked inode " << *in << dendl;
> +      // inode expired
> +      if (!in->is_replica(p->first))
> +       continue;
> +      while (1) {
> +       CDentry *dn = in->get_parent_dn();
> +       if (dn->is_replica(p->first))
> +         break;
> +       dn->add_replica(p->first);
> +       CDir *dir = dn->get_dir();
> +       if (dir->is_replica(p->first))
> +         break;
> +       dir->add_replica(p->first);
> +       in = dir->get_inode();
> +       if (in->is_replica(p->first))
> +         break;
> +       if (in->is_base())
> +         break;
> +      }
> +    }
> +  }
> +  rejoin_unlinked_inodes.clear();
>
>    // send acks to everyone in the recovery set
>    map<int,MMDSCacheRejoin*> ack;
> @@ -5203,23 +5313,29 @@ void MDCache::rejoin_send_acks()
>         CDentry *dn = q->second;
>         CDentry::linkage_t *dnl = dn->get_linkage();
>
> +       // inode
> +       CInode *in = NULL;
> +       if (dnl->is_primary())
> +         in = dnl->get_inode();
> +
>         // dentry
>         for (map<int,int>::iterator r = dn->replicas_begin();
>              r != dn->replicas_end();
> -            ++r)
> +            ++r) {
>           ack[r->first]->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
>                                            dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
>                                            dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
>                                            dnl->is_remote() ? dnl->get_remote_d_type():0,
>                                            ++r->second,
>                                            dn->lock.get_replica_state());
> +         // peer missed MDentrylink message ?
> +         if (in && !in->is_replica(r->first))
> +           in->add_replica(r->first);
> +       }
>
> -       if (!dnl->is_primary())
> +       if (!in)
>           continue;
>
> -       // inode
> -       CInode *in = dnl->get_inode();
> -
>         for (map<int,int>::iterator r = in->replicas_begin();
>              r != in->replicas_end();
>              ++r) {
> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
> index 85f5d65..09cc092 100644
> --- a/src/mds/MDCache.h
> +++ b/src/mds/MDCache.h
> @@ -416,6 +416,7 @@ protected:
>    set<CInode*> rejoin_undef_inodes;
>    set<CInode*> rejoin_potential_updated_scatterlocks;
>    set<CDir*>   rejoin_undef_dirfrags;
> +  map<int, set<CInode*> > rejoin_unlinked_inodes;
>
>    vector<CInode*> rejoin_recover_q, rejoin_check_q;
>    list<Context*> rejoin_waiters;
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 08/39] mds: consider MDS as recovered when it reaches clientreply state.
  2013-03-21  2:22     ` Yan, Zheng
@ 2013-03-21 21:43       ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21 21:43 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

On Wed, Mar 20, 2013 at 7:22 PM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> On 03/21/2013 02:40 AM, Greg Farnum wrote:
>> The idea of this patch makes sense, but I'm not sure we guarantee that each daemon sees every map update — if we don't, then an MDS that misses the map moving another MDS into CLIENTREPLAY won't process that MDS as having recovered on the next map. Sage or Joao, what are the guarantees subscription provides?
>> -Greg
>
> See MDS::active_start(), it also kicks clientreplay waiters. And I will fix the 'clientreply' typo in my git tree.

That's for the recovering MDS — I was referring to observers who are
watching it come up. However, I've just checked and the monitor does
guarantee every-map delivery, so this isn't a problem.

Reviewed-by: Greg Farnum <greg@inktank.com>

>
> Thanks
> Yan, Zheng
>
>>
>> Software Engineer #42 @ http://inktank.com | http://ceph.com
>>
>>
>> On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:
>>
>>> From: "Yan, Zheng" <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)>
>>>
>>> MDS in clientreply state already start servering requests. It also
>>> make MDS::handle_mds_recovery() and MDS::recovery_done() match.
>>>
>>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)>
>>> ---
>>> src/mds/MDS.cc (http://MDS.cc) | 2 ++
>>> 1 file changed, 2 insertions(+)
>>>
>>> diff --git a/src/mds/MDS.cc (http://MDS.cc) b/src/mds/MDS.cc (http://MDS.cc)
>>> index 282fa64..b91dcbd 100644
>>> --- a/src/mds/MDS.cc (http://MDS.cc)
>>> +++ b/src/mds/MDS.cc (http://MDS.cc)
>>> @@ -1032,7 +1032,9 @@ void MDS::handle_mds_map(MMDSMap *m)
>>>
>>> set<int> oldactive, active;
>>> oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE);
>>> + oldmap->get_mds_set(oldactive, MDSMap::STATE_CLIENTREPLAY);
>>> mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE);
>>> + mdsmap->get_mds_set(active, MDSMap::STATE_CLIENTREPLAY);
>>> for (set<int>::iterator p = active.begin(); p != active.end(); ++p)
>>> if (*p != whoami && // not me
>>> oldactive.count(*p) == 0) // newly so?
>>> --
>>> 1.7.11.7
>>
>>
>>
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 11/39] mds: don't delay processing replica buffer in slave request
  2013-03-21  4:15       ` Sage Weil
@ 2013-03-21 21:48         ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21 21:48 UTC (permalink / raw)
  To: Sage Weil; +Cc: Yan, Zheng, ceph-devel

On Wed, Mar 20, 2013 at 9:15 PM, Sage Weil <sage@inktank.com> wrote:
> On Thu, 21 Mar 2013, Yan, Zheng wrote:
>> On 03/21/2013 05:19 AM, Greg Farnum wrote:
>> > On Sunday, March 17, 2013 at 7:51 AM, Yan, Zheng wrote:
>> >> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>> >>
>> >> Replicated objects need to be added into the cache immediately
>> >>
>> >> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>> > Why do we need to add them right away? Shouldn't we have a journaled replica if we need it?
>> > -Greg
>>
>> The issue I encountered is that a lock action message was received, but the replicated objects weren't in the
>> cache because the slave request was delayed.
>
> This makes sense to me; the add_replica_*() methods that create and push
> replicas of cache objects to other nodes need to always be applied
> immediately, or else the cache coherency falls apart.
>
> There are similar games played between the client and mds with the caps
> protocol, although in that case IIRC there are certain limited
> circumstances where we can delay processing the message.  For mds->mds
> traffic, I don't think that's possible, unless *all* potentially dependent
> traffic is also delayed to preserve ordering and so forth.
>
> [That said, I didn't review the actual patch :)]

Oh, I had my mind stuck on recovery but this is just generic replicas
for slave requests.

Reviewed-by: Greg Farnum <greg@inktank.com>
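
A condensed sketch of the ordering requirement described above; the helpers
marked hypothetical are made up, the rest follows the shape of the patch
quoted below:

  void handle_slave_request_sketch(MMDSSlaveRequest *m) {
    // 1. replicated objects carried by the message enter the cache immediately
    if (m->stray.length() > 0) {
      CDentry *straydn = mdcache->add_replica_stray(m->stray, m->get_source().num());
      m->stray.clear();
      remember_stray_sketch(straydn);          // hypothetical: pin it on the mdr
    }
    // 2. only now may the request itself be deferred until this MDS is ready
    if (!ready_for_slave_requests_sketch()) {  // hypothetical readiness check
      retry_message_later_sketch(m);           // hypothetical: requeue the message
      return;
    }
    process_slave_request_sketch(m);           // hypothetical: the rest of the handler
  }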

>
> sage
>
>>
>> Thanks
>> Yan, Zheng
>>
>>
>> >
>> > Software Engineer #42 @ http://inktank.com | http://ceph.com
>> >> ---
>> >> src/mds/MDCache.cc | 12 ++++++++++++
>> >> src/mds/MDCache.h | 2 +-
>> >> src/mds/MDS.cc | 6 +++---
>> >> src/mds/Server.cc | 55 +++++++++++++++++++++++++++++++++++++++---------------
>> >> 4 files changed, 56 insertions(+), 19 deletions(-)
>> >>
>> >> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>> >> index 0f6b842..b668842 100644
>> >> --- a/src/mds/MDCache.cc
>> >> +++ b/src/mds/MDCache.cc
>> >> @@ -7722,6 +7722,18 @@ void MDCache::_find_ino_dir(inodeno_t ino, Context *fin, bufferlist& bl, int r)
>> >>
>> >> /* ---------------------------- */
>> >>
>> >> +int MDCache::get_num_client_requests()
>> >> +{
>> >> + int count = 0;
>> >> + for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
>> >> + p != active_requests.end();
>> >> + ++p) {
>> >> + if (p->second->reqid.name.is_client() && !p->second->is_slave())
>> >> + count++;
>> >> + }
>> >> + return count;
>> >> +}
>> >> +
>> >> /* This function takes over the reference to the passed Message */
>> >> MDRequest *MDCache::request_start(MClientRequest *req)
>> >> {
>> >> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
>> >> index a9f05c6..4634121 100644
>> >> --- a/src/mds/MDCache.h
>> >> +++ b/src/mds/MDCache.h
>> >> @@ -240,7 +240,7 @@ protected:
>> >> hash_map<metareqid_t, MDRequest*> active_requests;
>> >>
>> >> public:
>> >> - int get_num_active_requests() { return active_requests.size(); }
>> >> + int get_num_client_requests();
>> >>
>> >> MDRequest* request_start(MClientRequest *req);
>> >> MDRequest* request_start_slave(metareqid_t rid, __u32 attempt, int by);
>> >> diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
>> >> index b91dcbd..e99eecc 100644
>> >> --- a/src/mds/MDS.cc
>> >> +++ b/src/mds/MDS.cc
>> >> @@ -1900,9 +1900,9 @@ bool MDS::_dispatch(Message *m)
>> >> mdcache->is_open() &&
>> >> replay_queue.empty() &&
>> >> want_state == MDSMap::STATE_CLIENTREPLAY) {
>> >> - dout(10) << " still have " << mdcache->get_num_active_requests()
>> >> - << " active replay requests" << dendl;
>> >> - if (mdcache->get_num_active_requests() == 0)
>> >> + int num_requests = mdcache->get_num_client_requests();
>> >> + dout(10) << " still have " << num_requests << " active replay requests" << dendl;
>> >> + if (num_requests == 0)
>> >> clientreplay_done();
>> >> }
>> >>
>> >> diff --git a/src/mds/Server.cc b/src/mds/Server.cc
>> >> index 4c4c86b..8e89e4c 100644
>> >> --- a/src/mds/Server.cc
>> >> +++ b/src/mds/Server.cc
>> >> @@ -107,10 +107,8 @@ void Server::dispatch(Message *m)
>> >> (m->get_type() == CEPH_MSG_CLIENT_REQUEST &&
>> >> (static_cast<MClientRequest*>(m))->is_replay()))) {
>> >> // replaying!
>> >> - } else if (mds->is_clientreplay() && m->get_type() == MSG_MDS_SLAVE_REQUEST &&
>> >> - ((static_cast<MMDSSlaveRequest*>(m))->is_reply() ||
>> >> - !mds->mdsmap->is_active(m->get_source().num()))) {
>> >> - // slave reply or the master is also in the clientreplay stage
>> >> + } else if (m->get_type() == MSG_MDS_SLAVE_REQUEST) {
>> >> + // handle_slave_request() will wait if necessary
>> >> } else {
>> >> dout(3) << "not active yet, waiting" << dendl;
>> >> mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
>> >> @@ -1291,6 +1289,13 @@ void Server::handle_slave_request(MMDSSlaveRequest *m)
>> >> if (m->is_reply())
>> >> return handle_slave_request_reply(m);
>> >>
>> >> + CDentry *straydn = NULL;
>> >> + if (m->stray.length() > 0) {
>> >> + straydn = mdcache->add_replica_stray(m->stray, from);
>> >> + assert(straydn);
>> >> + m->stray.clear();
>> >> + }
>> >> +
>> >> // am i a new slave?
>> >> MDRequest *mdr = NULL;
>> >> if (mdcache->have_request(m->get_reqid())) {
>> >> @@ -1326,9 +1331,26 @@ void Server::handle_slave_request(MMDSSlaveRequest *m)
>> >> m->put();
>> >> return;
>> >> }
>> >> - mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m->get_source().num());
>> >> + mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), from);
>> >> }
>> >> assert(mdr->slave_request == 0); // only one at a time, please!
>> >> +
>> >> + if (straydn) {
>> >> + mdr->pin(straydn);
>> >> + mdr->straydn = straydn;
>> >> + }
>> >> +
>> >> + if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
>> >> + dout(3) << "not clientreplay|active yet, waiting" << dendl;
>> >> + mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
>> >> + return;
>> >> + } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
>> >> + mdr->locks.empty()) {
>> >> + dout(3) << "not active yet, waiting" << dendl;
>> >> + mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
>> >> + return;
>> >> + }
>> >> +
>> >> mdr->slave_request = m;
>> >>
>> >> dispatch_slave_request(mdr);
>> >> @@ -1339,6 +1361,12 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
>> >> {
>> >> int from = m->get_source().num();
>> >>
>> >> + if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
>> >> + dout(3) << "not clientreplay|active yet, waiting" << dendl;
>> >> + mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
>> >> + return;
>> >> + }
>> >> +
>> >> if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
>> >> metareqid_t r = m->get_reqid();
>> >> mds->mdcache->committed_master_slave(r, from);
>> >> @@ -5138,10 +5166,8 @@ void Server::handle_slave_rmdir_prep(MDRequest *mdr)
>> >> dout(10) << " dn " << *dn << dendl;
>> >> mdr->pin(dn);
>> >>
>> >> - assert(mdr->slave_request->stray.length() > 0);
>> >> - CDentry *straydn = mdcache->add_replica_stray(mdr->slave_request->stray, mdr->slave_to_mds);
>> >> - assert(straydn);
>> >> - mdr->pin(straydn);
>> >> + assert(mdr->straydn);
>> >> + CDentry *straydn = mdr->straydn;
>> >> dout(10) << " straydn " << *straydn << dendl;
>> >>
>> >> mdr->now = mdr->slave_request->now;
>> >> @@ -5208,6 +5234,7 @@ void Server::_logged_slave_rmdir(MDRequest *mdr, CDentry *dn, CDentry *straydn)
>> >> // done.
>> >> mdr->slave_request->put();
>> >> mdr->slave_request = 0;
>> >> + mdr->straydn = 0;
>> >> }
>> >>
>> >> void Server::handle_slave_rmdir_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
>> >> @@ -6460,15 +6487,12 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
>> >> // stray?
>> >> bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
>> >> (srcdnl->is_primary() || destdnl->is_primary()));
>> >> - CDentry *straydn = 0;
>> >> - if (destdnl->is_primary() && !linkmerge) {
>> >> - assert(mdr->slave_request->stray.length() > 0);
>> >> - straydn = mdcache->add_replica_stray(mdr->slave_request->stray, mdr->slave_to_mds);
>> >> + CDentry *straydn = mdr->straydn;
>> >> + if (destdnl->is_primary() && !linkmerge)
>> >> assert(straydn);
>> >> - mdr->pin(straydn);
>> >> - }
>> >>
>> >> mdr->now = mdr->slave_request->now;
>> >> + mdr->more()->srcdn_auth_mds = srcdn->authority().first;
>> >>
>> >> // set up commit waiter (early, to clean up any freezing etc we do)
>> >> if (!mdr->more()->slave_commit)
>> >> @@ -6651,6 +6675,7 @@ void Server::_logged_slave_rename(MDRequest *mdr,
>> >> // done.
>> >> mdr->slave_request->put();
>> >> mdr->slave_request = 0;
>> >> + mdr->straydn = 0;
>> >> }
>> >>
>> >> void Server::_commit_slave_rename(MDRequest *mdr, int r,
>> >> --
>> >> 1.7.11.7
>> >
>> >
>> >
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 13/39] mds: don't send resolve message between active MDS
  2013-03-21  2:55     ` Yan, Zheng
@ 2013-03-21 21:55       ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21 21:55 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

On Wed, Mar 20, 2013 at 7:55 PM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> On 03/21/2013 05:56 AM, Gregory Farnum wrote:
>> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
>>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>>
>>> When the MDS cluster is resolving, the current behavior is to send a subtree
>>> resolve message to all other MDS and wait for all other MDS' resolve messages.
>>> The problem is that an active MDS can have a different subtree map due to rename.
>>> Besides, gathering the active MDS's resolve messages is also racy. The only
>>> function of these messages is to disambiguate other MDS' imports. We can
>>> replace that with an import finish notification.
>>>
>>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>>> ---
>>>  src/mds/MDCache.cc  | 12 +++++++++---
>>>  src/mds/Migrator.cc | 25 +++++++++++++++++++++++--
>>>  src/mds/Migrator.h  |  3 ++-
>>>  3 files changed, 34 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>>> index c455a20..73c1d59 100644
>>> --- a/src/mds/MDCache.cc
>>> +++ b/src/mds/MDCache.cc
>>> @@ -2517,7 +2517,8 @@ void MDCache::send_subtree_resolves()
>>>         ++p) {
>>>      if (*p == mds->whoami)
>>>        continue;
>>> -    resolves[*p] = new MMDSResolve;
>>> +    if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
>>> +      resolves[*p] = new MMDSResolve;
>>>    }
>>>
>>>    // known
>>> @@ -2837,7 +2838,7 @@ void MDCache::handle_resolve(MMDSResolve *m)
>>>           migrator->import_reverse(dir);
>>>         } else {
>>>           dout(7) << "ambiguous import succeeded on " << *dir << dendl;
>>> -         migrator->import_finish(dir);
>>> +         migrator->import_finish(dir, true);
>>>         }
>>>         my_ambiguous_imports.erase(p);  // no longer ambiguous.
>>>        }
>>> @@ -3432,7 +3433,12 @@ void MDCache::rejoin_send_rejoins()
>>>         ++p) {
>>>      CDir *dir = p->first;
>>>      assert(dir->is_subtree_root());
>>> -    assert(!dir->is_ambiguous_dir_auth());
>>> +    if (dir->is_ambiguous_dir_auth()) {
>>> +      // exporter is recovering, importer is survivor.
>>
>> The importer has to be the MDS this code is running on, right?
>
> This code is for bystanders. The exporter is recovering, and its resolve message didn't claim
> the subtree. So the export must succeed.

Ah, yep. That's what I get for eyeing just the diff.

>
>>
>>> +      assert(rejoins.count(dir->authority().first));
>>> +      assert(!rejoins.count(dir->authority().second));
>>> +      continue;
>>> +    }
>>>
>>>      // my subtree?
>>>      if (dir->is_auth())
>>> diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
>>> index 5e53803..833df12 100644
>>> --- a/src/mds/Migrator.cc
>>> +++ b/src/mds/Migrator.cc
>>> @@ -2088,6 +2088,23 @@ void Migrator::import_reverse(CDir *dir)
>>>    }
>>>  }
>>>
>>> +void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
>>> +{
>>> +  dout(7) << "import_notify_finish " << *dir << dendl;
>>> +
>>> +  for (set<int>::iterator p = import_bystanders[dir].begin();
>>> +       p != import_bystanders[dir].end();
>>> +       ++p) {
>>> +    MExportDirNotify *notify =
>>> +      new MExportDirNotify(dir->dirfrag(), false,
>>> +                          pair<int,int>(import_peer[dir->dirfrag()], mds->get_nodeid()),
>>> +                          pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
>>
>> I don't think this is quite right — we're notifying them that we've
>> just finished importing data from somebody, right? And so we know that
>> we're the auth node...
>
> Yes. In the normal case, the exporter notifies the bystanders. But if the exporter crashes, the importer notifies
> the bystanders after it confirms the ambiguous import succeeded.

Never mind — I had the semantic meaning of these pairs wrong.

Reviewed-by: Greg Farnum <greg@inktank.com>
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 29/39] mds: avoid double auth pin for file recovery
  2013-03-21  3:20   ` Gregory Farnum
  2013-03-21  3:33     ` Yan, Zheng
@ 2013-03-21 21:58     ` Gregory Farnum
  1 sibling, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21 21:58 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Went over those mechanisms quickly, but a bit more carefully than before; looks good.
Reviewed-by: Greg Farnum <greg@inktank.com>

On Wed, Mar 20, 2013 at 8:20 PM, Gregory Farnum <greg@inktank.com> wrote:
> This looks good on its face but I haven't had the chance to dig
> through the recovery queue stuff yet (it's on my list following some
> issues with recovery speed). How'd you run across this? If it's being
> added to the recovery queue multiple times I want to make sure we
> don't have some other machinery trying to dequeue it multiple times,
> or a single waiter which needs to be a list or something.
> -Greg
>
> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>
>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>> ---
>>  src/mds/MDCache.cc | 6 ++++--
>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>> index 973a4d0..e9a79cd 100644
>> --- a/src/mds/MDCache.cc
>> +++ b/src/mds/MDCache.cc
>> @@ -5502,8 +5502,10 @@ void MDCache::_queue_file_recover(CInode *in)
>>    dout(15) << "_queue_file_recover " << *in << dendl;
>>    assert(in->is_auth());
>>    in->state_clear(CInode::STATE_NEEDSRECOVER);
>> -  in->state_set(CInode::STATE_RECOVERING);
>> -  in->auth_pin(this);
>> +  if (!in->state_test(CInode::STATE_RECOVERING)) {
>> +    in->state_set(CInode::STATE_RECOVERING);
>> +    in->auth_pin(this);
>> +  }
>>    file_recover_queue.insert(in);
>>  }
>>
>> --
>> 1.7.11.7
>>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 21/39] mds: encode dirfrag base in cache rejoin ack
  2013-03-21  6:41     ` Yan, Zheng
@ 2013-03-21 21:58       ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21 21:58 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

On Wed, Mar 20, 2013 at 11:41 PM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> On 03/21/2013 07:33 AM, Gregory Farnum wrote:
>> This needs to handle versioning the encoding based on peer feature bits too.
>>
>> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
>>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>>
>>> Cache rejoin ack message already encodes inode base, make it also encode
>>> dirfrag base. This allows the message to replicate stray dentries like
>>> MDentryUnlink message. The function will be used by later patch.
>>>
>>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>>> ---
>>>  src/mds/CDir.h                 | 20 +++++++++++++-------
>>>  src/mds/MDCache.cc             | 20 ++++++++++++++++++--
>>>  src/messages/MMDSCacheRejoin.h | 12 +++++++++++-
>>>  3 files changed, 42 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/src/mds/CDir.h b/src/mds/CDir.h
>>> index 79946f1..f4a3a3d 100644
>>> --- a/src/mds/CDir.h
>>> +++ b/src/mds/CDir.h
>>> @@ -437,23 +437,29 @@ private:
>>>      ::encode(dist, bl);
>>>    }
>>>
>>> -  void encode_replica(int who, bufferlist& bl) {
>>> -    __u32 nonce = add_replica(who);
>>> -    ::encode(nonce, bl);
>>> +  void _encode_base(bufferlist& bl) {
>>>      ::encode(first, bl);
>>>      ::encode(fnode, bl);
>>>      ::encode(dir_rep, bl);
>>>      ::encode(dir_rep_by, bl);
>>>    }
>>> -  void decode_replica(bufferlist::iterator& p) {
>>> -    __u32 nonce;
>>> -    ::decode(nonce, p);
>>> -    replica_nonce = nonce;
>>> +  void _decode_base(bufferlist::iterator& p) {
>>>      ::decode(first, p);
>>>      ::decode(fnode, p);
>>>      ::decode(dir_rep, p);
>>>      ::decode(dir_rep_by, p);
>>>    }
>>> +  void encode_replica(int who, bufferlist& bl) {
>>> +    __u32 nonce = add_replica(who);
>>> +    ::encode(nonce, bl);
>>> +    _encode_base(bl);
>>> +  }
>>> +  void decode_replica(bufferlist::iterator& p) {
>>> +    __u32 nonce;
>>> +    ::decode(nonce, p);
>>> +    replica_nonce = nonce;
>>> +    _decode_base(p);
>>> +  }
>>>
>>>
>>>
>>> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>>> index 8ba676e..344777e 100644
>>> --- a/src/mds/MDCache.cc
>>> +++ b/src/mds/MDCache.cc
>>> @@ -4510,8 +4510,22 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>>>      }
>>>    }
>>>
>>> +  // full dirfrags
>>> +  bufferlist::iterator p = ack->dirfrag_base.begin();
>>> +  while (!p.end()) {
>>> +    dirfrag_t df;
>>> +    bufferlist basebl;
>>> +    ::decode(df, p);
>>> +    ::decode(basebl, p);
>>> +    CDir *dir = get_dirfrag(df);
>>> +    assert(dir);
>>> +    bufferlist::iterator q = basebl.begin();
>>> +    dir->_decode_base(q);
>>> +    dout(10) << " got dir replica " << *dir << dendl;
>>> +  }
>>> +
>>>    // full inodes
>>> -  bufferlist::iterator p = ack->inode_base.begin();
>>> +  p = ack->inode_base.begin();
>>>    while (!p.end()) {
>>>      inodeno_t ino;
>>>      snapid_t last;
>>> @@ -5178,8 +5192,10 @@ void MDCache::rejoin_send_acks()
>>>        // dir
>>>        for (map<int,int>::iterator r = dir->replicas_begin();
>>>            r != dir->replicas_end();
>>> -          ++r)
>>> +          ++r) {
>>>         ack[r->first]->add_strong_dirfrag(dir->dirfrag(), ++r->second, dir->dir_rep);
>>> +       ack[r->first]->add_dirfrag_base(dir);
>>> +      }
>>>
>>>        for (CDir::map_t::iterator q = dir->items.begin();
>>>            q != dir->items.end();
>>> diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h
>>> index b88f551..7c37ab4 100644
>>> --- a/src/messages/MMDSCacheRejoin.h
>>> +++ b/src/messages/MMDSCacheRejoin.h
>>> @@ -20,6 +20,7 @@
>>>  #include "include/types.h"
>>>
>>>  #include "mds/CInode.h"
>>> +#include "mds/CDir.h"
>>>
>>>  // sent from replica to auth
>>>
>>> @@ -169,6 +170,7 @@ class MMDSCacheRejoin : public Message {
>>>    // full
>>>    bufferlist inode_base;
>>>    bufferlist inode_locks;
>>> +  bufferlist dirfrag_base;
>>>
>>>    // authpins, xlocks
>>>    struct slave_reqid {
>>> @@ -258,7 +260,13 @@ public:
>>>    void add_strong_dirfrag(dirfrag_t df, int n, int dr) {
>>>      strong_dirfrags[df] = dirfrag_strong(n, dr);
>>>    }
>>> -
>>> +  void add_dirfrag_base(CDir *dir) {
>>> +    ::encode(dir->dirfrag(), dirfrag_base);
>>> +    bufferlist bl;
>>> +    dir->_encode_base(bl);
>>> +    ::encode(bl, dirfrag_base);
>>> +  }
>>
>> We are guilty of doing this in other places, but we should avoid
>> implicit encodings like this one, especially when the decode happens
>> somewhere else like it does here. We can make a vector dirfrag_bases
>> and add to that, and then encode and decode it along with the rest of
>> the message — would that work for your purposes?
>> -Greg
>>
>
> Should I update this patch, or send a new patch that updates both {inode,dirfrag}_base?
>
> Thanks
> Yan, Zheng

Updating this one is fine for me. :)
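
Roughly the shape I was suggesting, as a sketch only (the member name and
exact types are illustrative, not a final design):

  // Explicit member on MMDSCacheRejoin instead of a blind bufferlist,
  // so the encode and decode stay with the rest of the message.
  vector<pair<dirfrag_t, bufferlist> > dirfrag_bases;

  void add_dirfrag_base(CDir *dir) {
    bufferlist bl;
    dir->_encode_base(bl);
    dirfrag_bases.push_back(make_pair(dir->dirfrag(), bl));
  }

  // encode_payload():  ::encode(dirfrag_bases, payload);
  // decode_payload():  ::decode(dirfrag_bases, p);
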
-Greg


>
>>> +
>>>    // dentries
>>>    void add_weak_dirfrag(dirfrag_t df) {
>>>      weak_dirfrags.insert(df);
>>> @@ -294,6 +302,7 @@ public:
>>>      ::encode(wrlocked_inodes, payload);
>>>      ::encode(cap_export_bl, payload);
>>>      ::encode(strong_dirfrags, payload);
>>> +    ::encode(dirfrag_base, payload);
>>>      ::encode(weak, payload);
>>>      ::encode(weak_dirfrags, payload);
>>>      ::encode(weak_inodes, payload);
>>> @@ -319,6 +328,7 @@ public:
>>>        ::decode(cap_export_paths, q);
>>>      }
>>>      ::decode(strong_dirfrags, p);
>>> +    ::decode(dirfrag_base, p);
>>>      ::decode(weak, p);
>>>      ::decode(weak_dirfrags, p);
>>>      ::decode(weak_inodes, p);
>>> --
>>> 1.7.11.7
>>>
>
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 04/39] mds: make sure table request id unique
  2013-03-21  8:07             ` Yan, Zheng
@ 2013-03-21 22:03               ` Gregory Farnum
  2013-03-25 11:30                 ` Yan, Zheng
  0 siblings, 1 reply; 117+ messages in thread
From: Gregory Farnum @ 2013-03-21 22:03 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Sage Weil, ceph-devel

On Thu, Mar 21, 2013 at 1:07 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> On 03/21/2013 02:31 AM, Greg Farnum wrote:
>> On Tuesday, March 19, 2013 at 11:49 PM, Yan, Zheng wrote:
>>> On 03/20/2013 02:15 PM, Sage Weil wrote:
>>>> On Wed, 20 Mar 2013, Yan, Zheng wrote:
>>>>> On 03/20/2013 07:09 AM, Greg Farnum wrote:
>>>>>> Hmm, this is definitely narrowing the race (probably enough to never hit it), but it's not actually eliminating it (if the restart happens after 4 billion requests?). More importantly this kind of symptom makes me worry that we might be papering over more serious issues with colliding states in the Table on restart.
>>>>>> I don't have the MDSTable semantics in my head so I'll need to look into this later unless somebody else volunteers to do so?
>>>>>
>>>>>
>>>>>
>>>>> Not just 4 billion requests, MDS restart has several stage, mdsmap epoch
>>>>> increases for each stage. I don't think there are any more colliding
>>>>> states in the table. The table client/server use two phase commit. it's
>>>>> similar to client request that involves multiple MDS. the reqid is
>>>>> analogy to client request id. The difference is client request ID is
>>>>> unique because new client always get an unique session id.
>>>>
>>>>
>>>>
>>>> Each time a tid is consumed (at least for an update) it is journaled in
>>>> the EMetaBlob::table_tids list, right? So we could actually take a max
>>>> from journal replay and pick up where we left off? That seems like the
>>>> cleanest.
>>>>
>>>> I'm not too worried about 2^32 tids, I guess, but it would be nicer to
>>>> avoid that possibility.
>>>
>>>
>>>
>>> Can we re-use the client request ID as table client request ID ?
>>>
>>> Regards
>>> Yan, Zheng
>>
>> Not sure what you're referring to here — do you mean the ID of the filesystem client request which prompted the update? I don't think that would work as client requests actually require two parts to be unique (the client GUID and the request seq number), and I'm pretty sure a single client request can spawn multiple Table updates.
>>
>
> You are right, client request ID does not work.
>
>> As I look over this more, it sure looks to me as if the effect of the code we have (when non-broken) is to rollback every non-committed request by an MDS which restarted — the only time it can handle the TableServer's "agree" with a different response is if the MDS was incorrectly marked out by the map. Am I parsing this correctly, Sage? Given that, and without having looked at the code more broadly, I think we want to add some sort of implicit or explicit handshake letting each of them know if the MDS actually disappeared. We use the process/address nonce to accomplish this in other places…
>> -Greg
>>
>
> The table server sends an 'agree' message to the table client after a 'prepare entry' is safely logged. The table server re-sends the 'agree' message in two cases: the table client restarts, or the table server itself restarts.
> The purpose of re-sending the 'agree' message is to check whether the table client still wants to keep the update preparation. (The table client might have crashed before submitting the update.) The purpose of the reqid is to associate a table update
> preparation request with the server's 'agree' reply message. The problem here is that the table client does not ensure the reqid is unique across restarts. If you feel 2^32 reqids are not safe enough, making the reqid a randomized 64-bit
> value should be safe enough.

Right. I'd like to somehow mark those reqids so that we can tell when
they come from a different incarnation of the MDS TableClient daemon.
One way is via some piece of random data that will probably
distinguish them, although something we know for certain is
different would be preferable. I think we can work something out
of the startup session each MDS establishes with the monitors, but I'm
not sure I can check any time soon; I have a number of other things to
get to now that I've gotten through (the first round on) this series.
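
To sketch the shape of it (made-up names here, not the real MDSTable
types): pair each reqid with a value that stays fixed for one MDS
incarnation, for example:

  #include <stdint.h>

  // Illustrative only -- not the actual MDSTableClient interface.
  struct table_reqid_t {
    uint64_t incarnation;  // fixed per MDS instance, changes on restart
    uint64_t seq;          // per-request counter within one incarnation
  };

  inline bool operator==(const table_reqid_t &a, const table_reqid_t &b) {
    return a.incarnation == b.incarnation && a.seq == b.seq;
  }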

Thanks for all the patches, by the way. :)
-Greg
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 18/39] mds: fix MDS recovery involving cross authority rename
  2013-03-21 17:59   ` Gregory Farnum
@ 2013-03-22  3:04     ` Yan, Zheng
  2013-03-29 22:02       ` Gregory Farnum
  0 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-22  3:04 UTC (permalink / raw)
  To: Gregory Farnum; +Cc: Sage Weil, ceph-devel

On 03/22/2013 01:59 AM, Gregory Farnum wrote:
> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>
>> In an MDS cluster, a rename operation may involve multiple MDS. If the
>> rename source's auth MDS crashes after some witness MDS have prepared
>> the rename but before the rename is committed, then when the MDS
>> recovers, its subtree map and linkages are different from the prepared
>> MDS'. This causes problems for both subtree resolve and cache rejoin.
>> The solution is, if the rename source's auth MDS fails, the prepared
>> witness MDS query the master MDS if the operation is committing. If
>> it's not, rollback the rename, then send resolve message to the
>> recovering MDS.
>>
>> Another similar case is a prepared witness MDS crashes when the
>> rename source's auth MDS has prepared or is preparing the operation.
>> When the witness recovers, the master just delays sending the resolve
>> ack message until it commits the operation.
>>
>> This patch also updates Server::handle_client_rename(). Make preparing
>> the rename source's auth MDS be the final step before committing the
>> rename.
> 
> Why? It's not immediately obvious to me what the benefit is, and the
> commit message should state it. :)

For the second case, it's possible the recovering MDS is the anchor server. The master delays
sending the resolve ack message until the pending update is committed. To commit the pending
update, the master needs the anchor server's preparation ack. So the master and the anchor server
would wait for each other.

> 
>>
>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>> ---
>>  src/mds/MDCache.cc |  75 +++++++++++++++++++++++++++++-----------
>>  src/mds/MDCache.h  |  17 +++++++--
>>  src/mds/Mutation.h |   2 ++
>>  src/mds/Server.cc  | 100 ++++++++++++++++++++++++++++-------------------------
>>  4 files changed, 124 insertions(+), 70 deletions(-)
>>
>> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>> index 9b37b1e..d934020 100644
>> --- a/src/mds/MDCache.cc
>> +++ b/src/mds/MDCache.cc
>> @@ -2491,7 +2491,7 @@ void MDCache::send_slave_resolves()
>>        if (!p->second->is_slave() || !p->second->slave_did_prepare())
>>         continue;
>>        int master = p->second->slave_to_mds;
>> -      if (resolve_set.count(master)) {
>> +      if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
>>         dout(10) << " including uncommitted " << *p->second << dendl;
>>         if (!resolves.count(master))
>>           resolves[master] = new MMDSResolve;
>> @@ -2610,6 +2610,7 @@ void MDCache::handle_mds_failure(int who)
>>
>>    resolve_gather.insert(who);
>>    discard_delayed_resolve(who);
>> +  ambiguous_slave_updates.erase(who);
>>
>>    rejoin_gather.insert(who);
>>    rejoin_sent.erase(who);        // i need to send another
>> @@ -2642,14 +2643,46 @@ void MDCache::handle_mds_failure(int who)
>>           finish.push_back(p->second);
>>        }
>>      }
>> +
>> +    if (p->second->is_slave() &&
>> +       p->second->slave_did_prepare() && p->second->more()->srcdn_auth_mds == who &&
>> +       mds->mdsmap->is_clientreplay_or_active_or_stopping(p->second->slave_to_mds)) {
>> +      // rename srcdn's auth mds failed, resolve even I'm a survivor.
>> +      dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl;
>> +      add_ambiguous_slave_update(p->first, p->second->slave_to_mds);
>> +    }
>>
>>      // failed node is slave?
>>      if (p->second->is_master() && !p->second->committing) {
>> +      if (p->second->more()->srcdn_auth_mds == who) {
>> +       dout(10) << " master request " << *p->second << " waiting for rename srcdn's auth mds."
>> +                << who << " to recover" << dendl;
>> +       assert(p->second->more()->witnessed.count(who) == 0);
>> +       if (p->second->more()->is_ambiguous_auth)
>> +         p->second->clear_ambiguous_auth();
>> +       // rename srcdn's auth mds failed, all witnesses will rollback
>> +       p->second->more()->witnessed.clear();
>> +       pending_masters.erase(p->first);
>> +      }
>> +
>>        if (p->second->more()->witnessed.count(who)) {
>> -       dout(10) << " master request " << *p->second << " no longer witnessed by slave mds." << who
>> -                << dendl;
>> -       // discard this peer's prepare (if any)
>> -       p->second->more()->witnessed.erase(who);
>> +       int srcdn_auth = p->second->more()->srcdn_auth_mds;
>> +       if (srcdn_auth >= 0 && p->second->more()->waiting_on_slave.count(srcdn_auth)) {
>> +         dout(10) << " master request " << *p->second << " waiting for rename srcdn's auth mds."
>> +                  << p->second->more()->srcdn_auth_mds << " to reply" << dendl;
>> +         // waiting for the last slave (rename srcdn's auth mds), delay sending resolve ack
>> +         // until either the request is committing or the last slave also fails.
>> +         assert(p->second->more()->waiting_on_slave.size() == 1);
>> +         pending_masters.insert(p->first);
> 
> The language about "last slave" is confusing me here — I'm with you
> that this rename should only have one slave, but I don't think it ever
> should have had more than one. Do you mean "only slave" or am I
> missing something?

Yes, I mean the 'only slave'. But the code 'more()->waiting_on_slave' also counts the witnesses
as slaves, which is why I used 'last slave'. Will update the comment.

> 
>> +       } else {
>> +         dout(10) << " master request " << *p->second << " no longer witnessed by slave mds."
>> +                  << who << " to recover" << dendl;
>> +         if (srcdn_auth >= 0)
>> +           assert(p->second->more()->witnessed.count(srcdn_auth) == 0);
>> +
>> +         // discard this peer's prepare (if any)
>> +         p->second->more()->witnessed.erase(who);
>> +       }
>>        }
>>
>>        if (p->second->more()->waiting_on_slave.count(who)) {
>> @@ -2657,14 +2690,8 @@ void MDCache::handle_mds_failure(int who)
>>                  << " to recover" << dendl;
>>         // retry request when peer recovers
>>         p->second->more()->waiting_on_slave.erase(who);
>> -       mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second));
>> -      }
>> -
>> -      if (p->second->has_more() && p->second->more()->is_ambiguous_auth &&
>> -         p->second->more()->rename_inode->authority().first == who) {
>> -       dout(10) << " master request " << *p->second << " waiting for renamed inode's auth mds." << who
>> -                << " to recover" << dendl;
>> -       p->second->clear_ambiguous_auth();
> 
> Why are you getting rid of waiting for the renamed inode's MDS? I
> could be misremembering, but I believe we need it, and it might be
> different from the source or dest dentry auths.

The code is moved up; see the "(p->second->more()->srcdn_auth_mds == who)" check above.

> 
>> +       if (p->second->more()->waiting_on_slave.empty())
>> +         mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second));
>>        }
>>
>>        if (p->second->locking && p->second->locking_target_mds == who)
>> @@ -2951,16 +2978,27 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
>>    dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
>>    int from = ack->get_source().num();
>>
>> -  if (!resolve_ack_gather.count(from)) {
>> +  if (!resolve_ack_gather.count(from) ||
>> +      mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
>>      ack->put();
>>      return;
>>    }
>>
>> +  if (ambiguous_slave_updates.count(from)) {
>> +    assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
>> +    assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
>> +  }
>> +
>>    for (vector<metareqid_t>::iterator p = ack->commit.begin();
>>         p != ack->commit.end();
>>         ++p) {
>>      dout(10) << " commit on slave " << *p << dendl;
>>
>> +    if (ambiguous_slave_updates.count(from)) {
>> +      remove_ambiguous_slave_update(*p, from);
>> +      continue;
>> +    }
>> +
>>      if (mds->is_resolve()) {
>>        // replay
>>        MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
>> @@ -3020,13 +3058,8 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
>>      }
>>    }
>>
>> -  if (!mds->is_resolve()) {
>> -    for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
>> -       p != active_requests.end(); ++p)
>> -      assert(p->second->slave_to_mds != from);
>> -  }
>> -
>> -  resolve_ack_gather.erase(from);
>> +  if (!ambiguous_slave_updates.count(from))
>> +    resolve_ack_gather.erase(from);
>>    if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
>>      send_subtree_resolves();
>>      process_delayed_resolve();
>> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
>> index 8f262b9..a05ced7 100644
>> --- a/src/mds/MDCache.h
>> +++ b/src/mds/MDCache.h
>> @@ -327,9 +327,8 @@ protected:
>>    map<metareqid_t, umaster>                 uncommitted_masters;         // master: req -> slave set
>>
>>    set<metareqid_t>             pending_masters;
>> +  map<int, set<metareqid_t> >  ambiguous_slave_updates;
>>
>> -  //map<metareqid_t, bool>     ambiguous_slave_updates;         // for log trimming.
>> -  //map<metareqid_t, Context*> waiting_for_slave_update_commit;
>>    friend class ESlaveUpdate;
>>    friend class ECommitted;
>>
>> @@ -353,6 +352,20 @@ protected:
>>  public:
>>    void remove_inode_recursive(CInode *in);
>>
>> +  bool is_ambiguous_slave_update(metareqid_t reqid, int master) {
>> +    return ambiguous_slave_updates.count(master) &&
>> +          ambiguous_slave_updates[master].count(reqid);
>> +  }
>> +  void add_ambiguous_slave_update(metareqid_t reqid, int master) {
>> +    ambiguous_slave_updates[master].insert(reqid);
>> +  }
>> +  void remove_ambiguous_slave_update(metareqid_t reqid, int master) {
>> +    assert(ambiguous_slave_updates[master].count(reqid));
>> +    ambiguous_slave_updates[master].erase(reqid);
>> +    if (ambiguous_slave_updates[master].empty())
>> +      ambiguous_slave_updates.erase(master);
>> +  }
>> +
>>    void add_rollback(metareqid_t reqid, int master) {
>>      need_resolve_rollback[reqid] = master;
>>    }
>> diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h
>> index 5013f04..de122a5 100644
>> --- a/src/mds/Mutation.h
>> +++ b/src/mds/Mutation.h
>> @@ -207,6 +207,7 @@ struct MDRequest : public Mutation {
>>
>>      // for rename
>>      set<int> extra_witnesses; // replica list from srcdn auth (rename)
>> +    int srcdn_auth_mds;
>>      version_t src_reanchor_atid;  // src->dst
>>      version_t dst_reanchor_atid;  // dst->stray
>>      bufferlist inode_import;
>> @@ -233,6 +234,7 @@ struct MDRequest : public Mutation {
>>      bufferlist rollback_bl;
>>
>>      More() :
>> +      srcdn_auth_mds(-1),
>>        src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0),
>>        rename_inode(0), is_freeze_authpin(false), is_ambiguous_auth(false),
>>        is_remote_frozen_authpin(false), is_inode_exporter(false),
>> diff --git a/src/mds/Server.cc b/src/mds/Server.cc
>> index 1330f11..b6e5665 100644
>> --- a/src/mds/Server.cc
>> +++ b/src/mds/Server.cc
>> @@ -5772,12 +5772,52 @@ void Server::handle_client_rename(MDRequest *mdr)
>>    if (mdr->now == utime_t())
>>      mdr->now = ceph_clock_now(g_ceph_context);
>>
>> +  // -- prepare anchor updates --
>> +  if (!linkmerge || srcdnl->is_primary()) {
>> +    C_GatherBuilder anchorgather(g_ceph_context);
>> +
>> +    if (srcdnl->is_primary() &&
>> +      (srcdnl->get_inode()->is_anchored() ||
>> +       (srcdnl->get_inode()->is_dir() && (srcdnl->get_inode()->inode.rstat.ranchors ||
>> +                                          srcdnl->get_inode()->nested_anchors ||
>> +                                          !mdcache->is_leaf_subtree(mdcache->get_projected_subtree_root(srcdn->get_dir()))))) &&
>> +      !mdr->more()->src_reanchor_atid) {
>> +      dout(10) << "reanchoring src->dst " << *srcdnl->get_inode() << dendl;
>> +      vector<Anchor> trace;
>> +      destdn->make_anchor_trace(trace, srcdnl->get_inode());
>> +      mds->anchorclient->prepare_update(srcdnl->get_inode()->ino(),
>> +                                       trace, &mdr->more()->src_reanchor_atid,
>> +                                       anchorgather.new_sub());
>> +    }
>> +    if (destdnl->is_primary() &&
>> +       destdnl->get_inode()->is_anchored() &&
>> +       !mdr->more()->dst_reanchor_atid) {
>> +      dout(10) << "reanchoring dst->stray " << *destdnl->get_inode() << dendl;
>> +
>> +      assert(straydn);
>> +      vector<Anchor> trace;
>> +      straydn->make_anchor_trace(trace, destdnl->get_inode());
>> +
>> +      mds->anchorclient->prepare_update(destdnl->get_inode()->ino(), trace,
>> +                 &mdr->more()->dst_reanchor_atid, anchorgather.new_sub());
>> +    }
>> +
>> +    if (anchorgather.has_subs())  {
>> +      anchorgather.set_finisher(new C_MDS_RetryRequest(mdcache, mdr));
>> +      anchorgather.activate();
>> +      return;  // waiting for anchor prepares
>> +    }
>> +
>> +    assert(g_conf->mds_kill_rename_at != 2);
>> +  }
>> +
>>    // -- prepare witnesses --
>>
>>    // do srcdn auth last
>>    int last = -1;
>>    if (!srcdn->is_auth()) {
>>      last = srcdn->authority().first;
>> +    mdr->more()->srcdn_auth_mds = last;
>>      // ask auth of srci to mark srci as ambiguous auth if more than two MDS
>>      // are involved in the rename operation.
>>      if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
>> @@ -5803,58 +5843,18 @@ void Server::handle_client_rename(MDRequest *mdr)
>>    if (!mdr->more()->waiting_on_slave.empty())
>>      return;  // we're waiting for a witness.
>>
>> -  if (last >= 0 &&
>> -      mdr->more()->witnessed.count(last) == 0 &&
>> -      mdr->more()->waiting_on_slave.count(last) == 0) {
>> +  if (last >= 0 && mdr->more()->witnessed.count(last) == 0) {
>>      dout(10) << " preparing last witness (srcdn auth)" << dendl;
>> +    assert(mdr->more()->waiting_on_slave.count(last) == 0);
>>      _rename_prepare_witness(mdr, last, witnesses, srcdn, destdn, straydn);
>>      return;
>>    }
>>
>>    // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
>>    if (!mdr->more()->slaves.empty() && !srci->is_dir())
>> -    assert(g_conf->mds_kill_rename_at != 2);
>> +    assert(g_conf->mds_kill_rename_at != 3);
>>    if (!mdr->more()->slaves.empty() && srci->is_dir())
>> -    assert(g_conf->mds_kill_rename_at != 3);
>> -
>> -  // -- prepare anchor updates --
>> -  if (!linkmerge || srcdnl->is_primary()) {
>> -    C_GatherBuilder anchorgather(g_ceph_context);
>> -
>> -    if (srcdnl->is_primary() &&
>> -       (srcdnl->get_inode()->is_anchored() ||
>> -        (srcdnl->get_inode()->is_dir() && (srcdnl->get_inode()->inode.rstat.ranchors ||
>> -                                           srcdnl->get_inode()->nested_anchors ||
>> -                                           !mdcache->is_leaf_subtree(mdcache->get_projected_subtree_root(srcdn->get_dir()))))) &&
>> -       !mdr->more()->src_reanchor_atid) {
>> -      dout(10) << "reanchoring src->dst " << *srcdnl->get_inode() << dendl;
>> -      vector<Anchor> trace;
>> -      destdn->make_anchor_trace(trace, srcdnl->get_inode());
>> -      mds->anchorclient->prepare_update(srcdnl->get_inode()->ino(),
>> -                                       trace, &mdr->more()->src_reanchor_atid,
>> -                                       anchorgather.new_sub());
>> -    }
>> -    if (destdnl->is_primary() &&
>> -       destdnl->get_inode()->is_anchored() &&
>> -       !mdr->more()->dst_reanchor_atid) {
>> -      dout(10) << "reanchoring dst->stray " << *destdnl->get_inode() << dendl;
>> -
>> -      assert(straydn);
>> -      vector<Anchor> trace;
>> -      straydn->make_anchor_trace(trace, destdnl->get_inode());
>> -
>> -      mds->anchorclient->prepare_update(destdnl->get_inode()->ino(), trace,
>> -                 &mdr->more()->dst_reanchor_atid, anchorgather.new_sub());
>> -    }
>> -
>> -    if (anchorgather.has_subs())  {
>> -      anchorgather.set_finisher(new C_MDS_RetryRequest(mdcache, mdr));
>> -      anchorgather.activate();
>> -      return;  // waiting for anchor prepares
>> -    }
>> -
>>      assert(g_conf->mds_kill_rename_at != 4);
>> -  }
>>
>>    // -- prepare journal entry --
>>    mdr->ls = mdlog->get_current_segment();
>> @@ -6762,10 +6762,17 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r,
>>      // abort
>>      //  rollback_bl may be empty if we froze the inode but had to provide an expanded
>>      // witness list from the master, and they failed before we tried prep again.
>> -    if (mdr->more()->rollback_bl.length())
>> -      do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
>> -    else
>> +    if (mdr->more()->rollback_bl.length()) {
>> +      if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
>> +       mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
>> +       // rollback but preserve the slave request
>> +       do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, NULL);
>> +      } else
>> +       do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
>> +    } else {
>>        dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
>> +      mds->mdcache->request_finish(mdr);
>> +    }
>>    }
>>  }
>>
>> @@ -6825,7 +6832,6 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr)
>>    dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
>>    // need to finish this update before sending resolve to claim the subtree
>>    mds->mdcache->add_rollback(rollback.reqid, master);
>> -  assert(mdr || mds->is_resolve());
>>
>>    Mutation *mut = new Mutation(rollback.reqid);
>>    mut->ls = mds->mdlog->get_current_segment();
>> --
>> 1.7.11.7
>>

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 22/39] mds: handle linkage mismatch during cache rejoin
  2013-03-21 21:23   ` Gregory Farnum
@ 2013-03-22  3:05     ` Yan, Zheng
  2013-03-25 16:14       ` Gregory Farnum
  2013-03-26  7:21     ` Yan, Zheng
  1 sibling, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-22  3:05 UTC (permalink / raw)
  To: Gregory Farnum; +Cc: ceph-devel, Sage Weil

On 03/22/2013 05:23 AM, Gregory Farnum wrote:
> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>
>> In an MDS cluster, not all file system namespace operations that impact
>> multiple MDS use two-phase commit. Some operations use dentry link/unlink
>> messages to update a replica dentry's linkage after they are committed by
>> the master MDS. It's possible the master MDS crashes after journaling an
>> operation, but before sending the dentry link/unlink messages. Later, when
>> the MDS recovers and receives cache rejoin messages from the surviving
>> MDS, it will find a linkage mismatch.
> 
> I think you're here talking about link/unlink, and the MDS crashing
> after it's sent out the LogEvent to the OSD but it hasn't actually
> dispatched the observer slave requests. Is that right? This commit
> message really confused me; I was trying to figure out which namespace
> operations were hacking around a proper 2-phase commit by unlinking
> and relinking inodes into the tree! (The link/unlink code also is
> doing a 2-phase commit, it just doesn't force a particular order for
> the journaling, which was previously left unhandled).

I was talking about the cases that use MDCache::send_dentry_{link,unlink}
to update replica dentries. There are many such uses in Server.cc.

> 
>>
>> The original cache rejoin code does not properly handle the case that
>> dentry unlink messages were missing. Unlinked inodes were linked to stray
>> dentries. So the cache rejoin ack message needs to push replicas of these
>> stray dentries to the surviving MDS.
>>
>> This patch also adds code that handles cache expiration in the middle of
>> cache rejoining.
>>
>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>> ---
>>  src/mds/MDCache.cc | 348 +++++++++++++++++++++++++++++++++++------------------
>>  src/mds/MDCache.h  |   1 +
>>  2 files changed, 233 insertions(+), 116 deletions(-)
>>
>> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>> index 344777e..38b1fdf 100644
>> --- a/src/mds/MDCache.cc
>> +++ b/src/mds/MDCache.cc
>> @@ -3536,7 +3536,6 @@ void MDCache::rejoin_send_rejoins()
>>      } else {
>>        // strong
>>        if (p->first == 0 && root) {
>> -       p->second->add_weak_inode(root->vino());
>>         p->second->add_strong_inode(root->vino(),
>>                                     root->get_replica_nonce(),
>>                                     root->get_caps_wanted(),
>> @@ -3550,7 +3549,6 @@ void MDCache::rejoin_send_rejoins()
>>        }
>>
>>        if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
>> -       p->second->add_weak_inode(in->vino());
>>         p->second->add_strong_inode(in->vino(),
>>                                     in->get_replica_nonce(),
>>                                     in->get_caps_wanted(),
>> @@ -3567,6 +3565,8 @@ void MDCache::rejoin_send_rejoins()
>>      for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
>>          p != active_requests.end();
>>          ++p) {
>> +      if ( p->second->is_slave())
>> +       continue;
>>        // auth pins
>>        for (set<MDSCacheObject*>::iterator q = p->second->remote_auth_pins.begin();
>>            q != p->second->remote_auth_pins.end();
>> @@ -4226,6 +4226,8 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>>      rejoin_potential_updated_scatterlocks.insert(in);
>>    }
>>
>> +  rejoin_unlinked_inodes[from].clear();
>> +
>>    // surviving peer may send incorrect dirfrag here (maybe they didn't
>>    // get the fragment notify, or maybe we rolled back?).  we need to
>>    // infer the right frag and get them with the program.  somehow.
>> @@ -4332,105 +4334,125 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>>
>>        dn->add_replica(from, q->second.nonce);
>>        dout(10) << " have " << *dn << dendl;
>> -
>> -      // inode?
>> -      if (dnl->is_primary()) {
>> -       CInode *in = dnl->get_inode();
>> -       assert(in);
>> -
>> -       if (strong->strong_inodes.count(in->vino())) {
>> -         MMDSCacheRejoin::inode_strong &is = strong->strong_inodes[in->vino()];
>>
>> -         // caps_wanted
>> -         if (is.caps_wanted) {
>> -           in->mds_caps_wanted[from] = is.caps_wanted;
>> -           dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
>> -                    << " on " << *in << dendl;
>> -         }
>> -
>> -         // scatterlocks?
>> -         //  infer state from replica state:
>> -         //   * go to MIX if they might have wrlocks
>> -         //   * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
>> -         in->filelock.infer_state_from_strong_rejoin(is.filelock, true);  // maybe also go to LOCK
>> -         in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
>> -         in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
>> -
>> -         // auth pin?
>> -         if (strong->authpinned_inodes.count(in->vino())) {
>> -           MMDSCacheRejoin::slave_reqid r = strong->authpinned_inodes[in->vino()];
>> -           dout(10) << " inode authpin by " << r << " on " << *in << dendl;
>> -
>> -           // get/create slave mdrequest
>> -           MDRequest *mdr;
>> -           if (have_request(r.reqid))
>> -             mdr = request_get(r.reqid);
>> -           else
>> -             mdr = request_start_slave(r.reqid, r.attempt, from);
>> -           if (strong->frozen_authpin_inodes.count(in->vino())) {
>> -             assert(!in->get_num_auth_pins());
>> -             mdr->freeze_auth_pin(in);
>> -           } else {
>> -             assert(!in->is_frozen_auth_pin());
>> -           }
>> -           mdr->auth_pin(in);
>> -         }
>> -         // xlock(s)?
>> -         if (strong->xlocked_inodes.count(in->vino())) {
>> -           for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->xlocked_inodes[in->vino()].begin();
>> -                r != strong->xlocked_inodes[in->vino()].end();
>> -                ++r) {
>> -             SimpleLock *lock = in->get_lock(r->first);
>> -             dout(10) << " inode xlock by " << r->second << " on " << *lock << " on " << *in << dendl;
>> -             MDRequest *mdr = request_get(r->second.reqid);  // should have this from auth_pin above.
>> -             assert(mdr->is_auth_pinned(in));
>> -             if (lock->is_stable())
>> -               in->auth_pin(lock);
>> -             lock->set_state(LOCK_XLOCK);
>> -             if (lock == &in->filelock)
>> -               in->loner_cap = -1;
>> -             lock->get_xlock(mdr, mdr->get_client());
>> -             mdr->xlocks.insert(lock);
>> -             mdr->locks.insert(lock);
>> -           }
>> -         }
>> -         // wrlock(s)?
>> -         if (strong->wrlocked_inodes.count(in->vino())) {
>> -           for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->wrlocked_inodes[in->vino()].begin();
>> -                r != strong->wrlocked_inodes[in->vino()].end();
>> -                ++r) {
>> -             SimpleLock *lock = in->get_lock(r->first);
>> -             dout(10) << " inode wrlock by " << r->second << " on " << *lock << " on " << *in << dendl;
>> -             MDRequest *mdr = request_get(r->second.reqid);  // should have this from auth_pin above.
>> -             assert(mdr->is_auth_pinned(in));
>> -             lock->set_state(LOCK_LOCK);
>> -             if (lock == &in->filelock)
>> -               in->loner_cap = -1;
>> -             lock->get_wrlock(true);
>> -             mdr->wrlocks.insert(lock);
>> -             mdr->locks.insert(lock);
>> -           }
>> +      if (dnl->is_primary()) {
>> +       if (q->second.is_primary()) {
>> +         if (!(vinodeno_t(q->second.ino, q->first.snapid) == dnl->get_inode()->vino())) {
> 
> Maybe it's worth adding an operator!= for vinodeno_t, since you seem
> to use this a couple times.
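
A minimal sketch of that, assuming vinodeno_t already has an operator==
(it is compared with == above):

  inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) {
    return !(l == r);
  }
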
> 
>> +           // the survivor missed MDentryUnlink+MDentryLink messages ?
>> +           assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
>> +           CInode *in = get_inode(q->second.ino, q->first.snapid);
>> +           assert(in);
>> +           rejoin_unlinked_inodes[from].insert(in);
>> +           dout(7) << " sender has primary dentry but wrong inode" << dendl;
>>           }
>>         } else {
>> -         dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl;
>> +         // the survivor missed MDentryLink message ?
>> +         assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
>> +         dout(7) << " sender doesn't primay dentry" << dendl;
> 
> doesn't have primary? or something else?

will fix.

> 
>> +       }
>> +      } else {
>> +       if (q->second.is_primary()) {
>> +         // the survivor missed MDentryUnlink message ?
>> +         CInode *in = get_inode(q->second.ino, q->first.snapid);
>> +         assert(in);
>> +         rejoin_unlinked_inodes[from].insert(in);
>> +         dout(7) << " sender has primary dentry but we don't" << dendl;
>>         }
>> -
>> -       in->add_replica(from, p->second.nonce);
>> -       dout(10) << " have " << *in << dendl;
>>        }
>>      }
>>    }
>>
>> -  // base inodes?  (root, stray, etc.)
>> -  for (set<vinodeno_t>::iterator p = strong->weak_inodes.begin();
>> -       p != strong->weak_inodes.end();
>> +  for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
>> +       p != strong->strong_inodes.end();
>>         ++p) {
>> -    CInode *in = get_inode(*p);
>> -    dout(10) << " have base " << *in << dendl;
>> -    in->add_replica(from);
>> +    CInode *in = get_inode(p->first);
>> +    assert(in);
>> +    in->add_replica(from, p->second.nonce);
>> +    dout(10) << " have " << *in << dendl;
>> +
>> +    MMDSCacheRejoin::inode_strong &is = p->second;
>> +
>> +    // caps_wanted
>> +    if (is.caps_wanted) {
>> +      in->mds_caps_wanted[from] = is.caps_wanted;
>> +      dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
>> +              << " on " << *in << dendl;
>> +    }
>> +
>> +    // scatterlocks?
>> +    //  infer state from replica state:
>> +    //   * go to MIX if they might have wrlocks
>> +    //   * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
>> +    in->filelock.infer_state_from_strong_rejoin(is.filelock, true);  // maybe also go to LOCK
>> +    in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
>> +    in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
>> +
>> +    // auth pin?
>> +    if (strong->authpinned_inodes.count(in->vino())) {
>> +      MMDSCacheRejoin::slave_reqid r = strong->authpinned_inodes[in->vino()];
>> +      dout(10) << " inode authpin by " << r << " on " << *in << dendl;
>> +
>> +      // get/create slave mdrequest
>> +      MDRequest *mdr;
>> +      if (have_request(r.reqid))
>> +       mdr = request_get(r.reqid);
>> +      else
>> +       mdr = request_start_slave(r.reqid, r.attempt, from);
>> +      if (strong->frozen_authpin_inodes.count(in->vino())) {
>> +       assert(!in->get_num_auth_pins());
>> +       mdr->freeze_auth_pin(in);
>> +      } else {
>> +       assert(!in->is_frozen_auth_pin());
>> +      }
>> +      mdr->auth_pin(in);
>> +    }
>> +    // xlock(s)?
>> +    if (strong->xlocked_inodes.count(in->vino())) {
>> +      for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
>> +          q != strong->xlocked_inodes[in->vino()].end();
>> +          ++q) {
>> +       SimpleLock *lock = in->get_lock(q->first);
>> +       dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
>> +       MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
>> +       assert(mdr->is_auth_pinned(in));
>> +       if (lock->is_stable())
>> +         in->auth_pin(lock);
>> +       lock->set_state(LOCK_XLOCK);
>> +       if (lock == &in->filelock)
>> +         in->loner_cap = -1;
>> +       lock->get_xlock(mdr, mdr->get_client());
>> +       mdr->xlocks.insert(lock);
>> +       mdr->locks.insert(lock);
>> +      }
>> +    }
>> +    // wrlock(s)?
>> +    if (strong->wrlocked_inodes.count(in->vino())) {
>> +      for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->wrlocked_inodes[in->vino()].begin();
>> +          q != strong->wrlocked_inodes[in->vino()].end();
>> +          ++q) {
>> +       SimpleLock *lock = in->get_lock(q->first);
>> +       dout(10) << " inode wrlock by " << q->second << " on " << *lock << " on " << *in << dendl;
>> +       MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
>> +       assert(mdr->is_auth_pinned(in));
>> +       lock->set_state(LOCK_LOCK);
>> +       if (lock == &in->filelock)
>> +         in->loner_cap = -1;
>> +       lock->get_wrlock(true);
>> +       mdr->wrlocks.insert(lock);
>> +       mdr->locks.insert(lock);
>> +      }
>> +    }
>>    }
>>
>> -
>> +  // unlinked inodes should be in stray
>> +  for (set<CInode*>::iterator p = rejoin_unlinked_inodes[from].begin();
>> +       p != rejoin_unlinked_inodes[from].end();
>> +       ++p) {
>> +    CInode *in = *p;
>> +    dout(7) << " unlinked inode " << *in << dendl;
>> +    assert(in->get_parent_dn());
>> +    assert(in->is_replica(from));
>> +  }
> 
> I'm not clear on why we need to check this here — the previous for
> loop wasn't adding any inodes to the cache, so shouldn't we just check
> these conditions as we add them?
> 
will update the code.

Thanks
Yan, Zheng

>>
>>    // done?
>>    assert(rejoin_gather.count(from));
>> @@ -4448,6 +4470,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>>    dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
>>    int from = ack->get_source().num();
>>
>> +  // for sending cache expire message
>> +  list<CInode*> isolated_inodes;
>> +
>>    // dirs
>>    for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
>>         p != ack->strong_dirfrags.end();
>> @@ -4455,7 +4480,29 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>>      // we may have had incorrect dir fragmentation; refragment based
>>      // on what they auth tells us.
>>      CDir *dir = get_force_dirfrag(p->first);
>> -    assert(dir);
>> +    if (!dir) {
>> +      CInode *diri = get_inode(p->first.ino);
>> +      if (!diri) {
>> +       // barebones inode; the full inode loop below will clean up.
>> +       diri = new CInode(this, false);
>> +       diri->inode.ino = p->first.ino;
>> +       diri->inode.mode = S_IFDIR;
>> +       if (MDS_INO_MDSDIR(p->first.ino)) {
>> +         diri->inode_auth = pair<int,int>(from, CDIR_AUTH_UNKNOWN);
>> +         add_inode(diri);
>> +         dout(10) << " add inode " << *diri << dendl;
>> +       } else {
>> +         diri->inode_auth = CDIR_AUTH_UNDEF;
>> +         isolated_inodes.push_back(diri);
>> +         dout(10) << " unconnected dirfrag " << p->first << dendl;
>> +       }
>> +      }
>> +      // barebones dirfrag; the full dirfrag loop below will clean up.
>> +      dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
>> +      if (dir->authority().first != from)
>> +       adjust_subtree_auth(dir, from);
>> +      dout(10) << " add dirfrag " << *dir << dendl;
>> +    }
>>
>>      dir->set_replica_nonce(p->second.nonce);
>>      dir->state_clear(CDir::STATE_REJOINING);
>> @@ -4467,7 +4514,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>>          q != dmap.end();
>>          ++q) {
>>        CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
>> -      assert(dn);
>> +      if(!dn)
>> +       dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
>> +
>>        CDentry::linkage_t *dnl = dn->get_linkage();
>>
>>        assert(dn->last == q->first.snapid);
>> @@ -4476,33 +4525,48 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>>         dn->first = q->second.first;
>>        }
>>
>> +      // may have bad linkage if we missed dentry link/unlink messages
>> +      if (dnl->is_primary()) {
>> +       CInode *in = dnl->get_inode();
>> +       if (!q->second.is_primary() ||
>> +           !(vinodeno_t(q->second.ino, q->first.snapid) == in->vino())) {
>> +         dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
>> +         dir->unlink_inode(dn);
>> +       }
>> +      } else if (dnl->is_remote()) {
>> +       if (!q->second.is_remote() ||
>> +           q->second.remote_ino != dnl->get_remote_ino() ||
>> +           q->second.remote_d_type != dnl->get_remote_d_type()) {
>> +         dout(10) << " had bad linkage for " << *dn <<  dendl;
>> +         dir->unlink_inode(dn);
>> +       }
>> +      } else {
>> +       if (!q->second.is_null())
>> +         dout(10) << " had bad linkage for " << *dn <<  dendl;
>> +      }
>> +
>>        // hmm, did we have the proper linkage here?
>> -      if (dnl->is_null() &&
>> -         !q->second.is_null()) {
>> -       dout(10) << " had bad (missing) linkage for " << *dn << dendl;
>> +      if (dnl->is_null() && !q->second.is_null()) {
>>         if (q->second.is_remote()) {
>>           dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
>>         } else {
>>           CInode *in = get_inode(q->second.ino, q->first.snapid);
>> -         assert(in == 0);  // a rename would have been caught be the resolve stage.
>> -         // barebones inode; the full inode loop below will clean up.
>> -         in = new CInode(this, false, q->second.first, q->first.snapid);
>> -         in->inode.ino = q->second.ino;
>> -         add_inode(in);
>> +         if (!in) {
>> +           // barebones inode; assume it's dir, the full inode loop below will clean up.
>> +           in = new CInode(this, false, q->second.first, q->first.snapid);
>> +           in->inode.ino = q->second.ino;
>> +           in->inode.mode = S_IFDIR;
>> +           add_inode(in);
>> +           dout(10) << " add inode " << *in << dendl;
>> +         } else if (in->get_parent_dn()) {
>> +           dout(10) << " had bad linkage for " << *(in->get_parent_dn())
>> +                    << ", unlinking " << *in << dendl;
>> +           in->get_parent_dir()->unlink_inode(in->get_parent_dn());
>> +         }
>>           dn->dir->link_primary_inode(dn, in);
>>         }
>>        }
>> -      else if (!dnl->is_null() &&
>> -              q->second.is_null()) {
>> -       dout(0) << " had bad linkage for " << *dn << dendl;
>> -       /*
>> -        * this should happen:
>> -        *  if we're a survivor, any unlink should commit or rollback during
>> -        * the resolve stage.
>> -        *  if we failed, we shouldn't have non-auth leaf dentries at all
>> -        */
>> -       assert(0);  // uh oh.
>> -      }
>> +
>>        dn->set_replica_nonce(q->second.nonce);
>>        dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters);
>>        dn->state_clear(CDentry::STATE_REJOINING);
>> @@ -4564,6 +4628,21 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>>      dout(10) << " got inode locks " << *in << dendl;
>>    }
>>
>> +  // trim unconnected subtree
>> +  if (!isolated_inodes.empty()) {
>> +    map<int, MCacheExpire*> expiremap;
>> +    for (list<CInode*>::iterator p = isolated_inodes.begin();
>> +        p != isolated_inodes.end();
>> +        ++p) {
>> +      list<CDir*> ls;
>> +      (*p)->get_dirfrags(ls);
>> +      trim_dirfrag(*ls.begin(), 0, expiremap);
>> +      assert((*p)->get_num_ref() == 0);
>> +      delete *p;
>> +    }
>> +    send_expire_messages(expiremap);
>> +  }
>> +
>>    // done?
>>    assert(rejoin_ack_gather.count(from));
>>    rejoin_ack_gather.erase(from);
>> @@ -5164,6 +5243,37 @@ void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snap
>>  void MDCache::rejoin_send_acks()
>>  {
>>    dout(7) << "rejoin_send_acks" << dendl;
>> +
>> +  // replicate stray
>> +  for (map<int, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
>> +       p != rejoin_unlinked_inodes.end();
>> +       ++p) {
>> +    for (set<CInode*>::iterator q = p->second.begin();
>> +        q != p->second.end();
>> +        ++q) {
>> +      CInode *in = *q;
>> +      dout(7) << " unlinked inode " << *in << dendl;
>> +      // inode expired
>> +      if (!in->is_replica(p->first))
>> +       continue;
>> +      while (1) {
>> +       CDentry *dn = in->get_parent_dn();
>> +       if (dn->is_replica(p->first))
>> +         break;
>> +       dn->add_replica(p->first);
>> +       CDir *dir = dn->get_dir();
>> +       if (dir->is_replica(p->first))
>> +         break;
>> +       dir->add_replica(p->first);
>> +       in = dir->get_inode();
>> +       if (in->is_replica(p->first))
>> +         break;
>> +       if (in->is_base())
>> +         break;
>> +      }
>> +    }
>> +  }
>> +  rejoin_unlinked_inodes.clear();
>>
>>    // send acks to everyone in the recovery set
>>    map<int,MMDSCacheRejoin*> ack;
>> @@ -5203,23 +5313,29 @@ void MDCache::rejoin_send_acks()
>>         CDentry *dn = q->second;
>>         CDentry::linkage_t *dnl = dn->get_linkage();
>>
>> +       // inode
>> +       CInode *in = NULL;
>> +       if (dnl->is_primary())
>> +         in = dnl->get_inode();
>> +
>>         // dentry
>>         for (map<int,int>::iterator r = dn->replicas_begin();
>>              r != dn->replicas_end();
>> -            ++r)
>> +            ++r) {
>>           ack[r->first]->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
>>                                            dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
>>                                            dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
>>                                            dnl->is_remote() ? dnl->get_remote_d_type():0,
>>                                            ++r->second,
>>                                            dn->lock.get_replica_state());
>> +         // peer missed MDentrylink message ?
>> +         if (in && !in->is_replica(r->first))
>> +           in->add_replica(r->first);
>> +       }
>>
>> -       if (!dnl->is_primary())
>> +       if (!in)
>>           continue;
>>
>> -       // inode
>> -       CInode *in = dnl->get_inode();
>> -
>>         for (map<int,int>::iterator r = in->replicas_begin();
>>              r != in->replicas_end();
>>              ++r) {
>> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
>> index 85f5d65..09cc092 100644
>> --- a/src/mds/MDCache.h
>> +++ b/src/mds/MDCache.h
>> @@ -416,6 +416,7 @@ protected:
>>    set<CInode*> rejoin_undef_inodes;
>>    set<CInode*> rejoin_potential_updated_scatterlocks;
>>    set<CDir*>   rejoin_undef_dirfrags;
>> +  map<int, set<CInode*> > rejoin_unlinked_inodes;
>>
>>    vector<CInode*> rejoin_recover_q, rejoin_check_q;
>>    list<Context*> rejoin_waiters;
>> --
>> 1.7.11.7
>>


^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 04/39] mds: make sure table request id unique
  2013-03-21 22:03               ` Gregory Farnum
@ 2013-03-25 11:30                 ` Yan, Zheng
  2013-03-29 22:12                   ` Gregory Farnum
  0 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-25 11:30 UTC (permalink / raw)
  To: Gregory Farnum; +Cc: Sage Weil, ceph-devel

On 03/22/2013 06:03 AM, Gregory Farnum wrote:
> Right. I'd like to somehow mark those reqid's so that we can tell when
> they come from a different incarnation of the MDS TableClient daemon.
> One way is via some piece of random data that will probably
> distinguish them, although if we have something which we can know is
> different that would be preferable. I think we can work something out
> of the startup session data each MDS does with the monitors, but I'm
> not sure I can check any time soon; I have a number of other things to
> get to now that I've gotten through (the first round on) this series.
> 

How about the attached patch?

Thanks

----
commit d460b766e16ec2cacac239a74af0e226108ab95a
Author: Yan, Zheng <zheng.z.yan@intel.com>
Date:   Sat Mar 16 08:02:18 2013 +0800

    mds: make sure table request id unique
    
    When an MDS becomes active, the table server re-sends 'agree' messages
    for old prepared requests. If the recovered MDS starts a new table request
    at the same time, the new request's ID can happen to be the same as an old
    prepared request's ID, because the current table client code assigns
    request IDs from zero after the MDS restarts.
    
    This patch makes the table server send a 'ready' message when a table
    client becomes active, or when the server itself becomes active. The
    'ready' message updates the table client's last_reqid to avoid request ID
    collisions. It also replaces the roles of the finish_recovery() and
    handle_mds_recovery() callbacks for the table client.
    
    Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
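
(For readers following the thread: a minimal, self-contained sketch of
the handshake this patch introduces. ToyTableServer and ToyTableClient
below are hypothetical stand-ins, not the real MDSTableServer /
MDSTableClient classes; the point is only that the server hands a
recovering client a starting reqid above every old prepared request's
id, so freshly assigned ids cannot collide with them.)

#include <stdint.h>
#include <cassert>
#include <map>

// Hypothetical stand-in for the server side: 'prepared' holds the reqids
// of requests it prepared before the client restarted.
struct ToyTableServer {
  std::map<uint64_t, int> prepared;   // reqid -> pending state (elided)

  // Computed when a client MDS recovers; sent with TABLESERVER_OP_SERVER_READY.
  uint64_t server_ready_reqid() const {
    uint64_t next = 1;
    for (std::map<uint64_t, int>::const_iterator p = prepared.begin();
         p != prepared.end(); ++p)
      if (p->first >= next)
        next = p->first + 1;          // skip past every old prepared id
    return next;
  }
};

// Hypothetical stand-in for the client side.
struct ToyTableClient {
  uint64_t last_reqid;                // 0 means "server not ready yet"
  ToyTableClient() : last_reqid(0) {}

  void handle_server_ready(uint64_t reqid) {
    if (last_reqid == 0)
      last_reqid = reqid;             // start above the old prepared ids
    // ... then resend queries, queued prepares and commits (elided)
  }

  uint64_t prepare_new_request() {
    assert(last_reqid > 0);           // otherwise the prepare must be queued
    return ++last_reqid;              // cannot collide with an old reqid
  }
};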

diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index bb1c833..834a7aa 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -1508,14 +1508,13 @@ void MDS::recovery_done()
   
   // kick anchortable (resent AGREEs)
   if (mdsmap->get_tableserver() == whoami) {
-    anchorserver->finish_recovery();
-    snapserver->finish_recovery();
+    set<int> active;
+    mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE);
+    mdsmap->get_mds_set(active, MDSMap::STATE_STOPPING);
+    anchorserver->finish_recovery(active);
+    snapserver->finish_recovery(active);
   }
-  
-  // kick anchorclient (resent COMMITs)
-  anchorclient->finish_recovery();
-  snapclient->finish_recovery();
-  
+
   mdcache->start_recovered_truncates();
   mdcache->do_file_recover();
 
@@ -1537,8 +1536,6 @@ void MDS::handle_mds_recovery(int who)
     anchorserver->handle_mds_recovery(who);
     snapserver->handle_mds_recovery(who);
   }
-  anchorclient->handle_mds_recovery(who);
-  snapclient->handle_mds_recovery(who);
   
   queue_waiters(waiting_for_active_peer[who]);
   waiting_for_active_peer.erase(who);
diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc
index ea021f5..29e172b 100644
--- a/src/mds/MDSTableClient.cc
+++ b/src/mds/MDSTableClient.cc
@@ -101,6 +101,16 @@ void MDSTableClient::handle_request(class MMDSTableRequest *m)
     }
     break;
 
+  case TABLESERVER_OP_SERVER_READY:
+    if (last_reqid == 0) {
+      assert(reqid > 0);
+      last_reqid = reqid;
+    }
+
+    resend_queries();
+    resend_prepares();
+    resend_commits();
+    break;
   default:
     assert(0);
   }
@@ -126,19 +136,23 @@ void MDSTableClient::_logged_ack(version_t tid)
 void MDSTableClient::_prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl,
 			      Context *onfinish)
 {
-  uint64_t reqid = ++last_reqid;
-  dout(10) << "_prepare " << reqid << dendl;
-
-  // send message
-  MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, reqid);
-  req->bl = mutation;
-
-  pending_prepare[reqid].mutation = mutation;
-  pending_prepare[reqid].ptid = ptid;
-  pending_prepare[reqid].pbl = pbl;
-  pending_prepare[reqid].onfinish = onfinish;
-
-  send_to_tableserver(req);
+  if (last_reqid > 0) {
+    uint64_t reqid = ++last_reqid;
+    dout(10) << "_prepare " << reqid << dendl;
+    // send message
+    MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, reqid);
+    req->bl = mutation;
+
+    pending_prepare[reqid].mutation = mutation;
+    pending_prepare[reqid].ptid = ptid;
+    pending_prepare[reqid].pbl = pbl;
+    pending_prepare[reqid].onfinish = onfinish;
+
+    send_to_tableserver(req);
+  } else {
+    dout(10) << "table server is not ready yet, waiting" << dendl;
+    waiting_for_server.push_back(_pending_prepare(onfinish, ptid, pbl, mutation));
+  }
 }
 
 void MDSTableClient::send_to_tableserver(MMDSTableRequest *req)
@@ -176,6 +190,7 @@ void MDSTableClient::got_journaled_agree(version_t tid, LogSegment *ls)
   ls->pending_commit_tids[table].insert(tid);
   pending_commit[tid] = ls;
 }
+
 void MDSTableClient::got_journaled_ack(version_t tid)
 {
   dout(10) << "got_journaled_ack " << tid << dendl;
@@ -185,12 +200,6 @@ void MDSTableClient::got_journaled_ack(version_t tid)
   }
 }
 
-void MDSTableClient::finish_recovery()
-{
-  dout(7) << "finish_recovery" << dendl;
-  resend_commits();
-}
-
 void MDSTableClient::resend_commits()
 {
   for (map<version_t,LogSegment*>::iterator p = pending_commit.begin();
@@ -202,24 +211,18 @@ void MDSTableClient::resend_commits()
   }
 }
 
-void MDSTableClient::handle_mds_recovery(int who)
+void MDSTableClient::resend_prepares()
 {
-  dout(7) << "handle_mds_recovery mds." << who << dendl;
-
-  if (who != mds->mdsmap->get_tableserver()) 
-    return; // do nothing.
-
-  resend_queries();
-  
-  // prepares.
+  while (!waiting_for_server.empty()) {
+    pending_prepare[++last_reqid] = waiting_for_server.front();
+    waiting_for_server.pop_front();
+  }
   for (map<uint64_t, _pending_prepare>::iterator p = pending_prepare.begin();
        p != pending_prepare.end();
        ++p) {
-    dout(10) << "resending " << p->first << dendl;
+    dout(10) << "resending prepare on " << p->first << dendl;
     MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, p->first);
     req->bl = p->second.mutation;
     mds->send_message_mds(req, mds->mdsmap->get_tableserver());
-  } 
-
-  resend_commits();
+  }
 }
diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
index e15837f..7638260 100644
--- a/src/mds/MDSTableClient.h
+++ b/src/mds/MDSTableClient.h
@@ -38,9 +38,12 @@ protected:
     bufferlist mutation;
 
     _pending_prepare() : onfinish(0), ptid(0), pbl(0) {}
+    _pending_prepare(Context *c, version_t *pt, bufferlist *pb, bufferlist& m) :
+      onfinish(c), ptid(pt), pbl(pb), mutation(m) {}
   };
 
   map<uint64_t, _pending_prepare> pending_prepare;
+  list<_pending_prepare> waiting_for_server;
 
   // pending commits
   map<version_t, LogSegment*> pending_commit;
@@ -68,9 +71,8 @@ public:
   void _prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl, Context *onfinish);
   void commit(version_t tid, LogSegment *ls);
 
-  // for recovery (by other nodes)
-  void handle_mds_recovery(int mds); // called when someone else recovers
   void resend_commits();
+  void resend_prepares();
 
   // for recovery (by me)
   void got_journaled_agree(version_t tid, LogSegment *ls);
@@ -82,7 +84,6 @@ public:
   void wait_for_ack(version_t tid, Context *c) {
     ack_waiters[tid].push_back(c);
   }
-  void finish_recovery();                // called when i recover and go active
 
   void send_to_tableserver(MMDSTableRequest *req);
 
diff --git a/src/mds/MDSTableServer.cc b/src/mds/MDSTableServer.cc
index 4f86ff1..e56e2b4 100644
--- a/src/mds/MDSTableServer.cc
+++ b/src/mds/MDSTableServer.cc
@@ -144,24 +144,30 @@ void MDSTableServer::do_server_update(bufferlist& bl)
 
 // recovery
 
-void MDSTableServer::finish_recovery()
+void MDSTableServer::finish_recovery(set<int>& active)
 {
   dout(7) << "finish_recovery" << dendl;
-  handle_mds_recovery(-1);  // resend agrees for everyone.
+  for (set<int>::iterator p = active.begin(); p != active.end(); ++p)
+    handle_mds_recovery(*p);  // resend agrees for everyone.
 }
 
 void MDSTableServer::handle_mds_recovery(int who)
 {
-  if (who >= 0)
-    dout(7) << "handle_mds_recovery mds." << who << dendl;
-  
+  dout(7) << "handle_mds_recovery mds." << who << dendl;
+
+  uint64_t next_reqid = 1;
   // resend agrees for recovered mds
   for (map<version_t,mds_table_pending_t>::iterator p = pending_for_mds.begin();
        p != pending_for_mds.end();
        ++p) {
-    if (who >= 0 && p->second.mds != who)
+    if (p->second.mds != who)
       continue;
+    if (p->second.reqid >= next_reqid)
+      next_reqid = p->second.reqid + 1;
     MMDSTableRequest *reply = new MMDSTableRequest(table, TABLESERVER_OP_AGREE, p->second.reqid, p->second.tid);
-    mds->send_message_mds(reply, p->second.mds);
+    mds->send_message_mds(reply, who);
   }
+
+  MMDSTableRequest *reply = new MMDSTableRequest(table, TABLESERVER_OP_SERVER_READY, next_reqid);
+  mds->send_message_mds(reply, who);
 }
diff --git a/src/mds/MDSTableServer.h b/src/mds/MDSTableServer.h
index 26cd594..55827e7 100644
--- a/src/mds/MDSTableServer.h
+++ b/src/mds/MDSTableServer.h
@@ -90,7 +90,7 @@ private:
   }
 
   // recovery
-  void finish_recovery();
+  void finish_recovery(set<int>& active);
   void handle_mds_recovery(int who);
 };
 
diff --git a/src/mds/mds_table_types.h b/src/mds/mds_table_types.h
index b094c75..c08519a 100644
--- a/src/mds/mds_table_types.h
+++ b/src/mds/mds_table_types.h
@@ -39,6 +39,7 @@ enum {
   TABLESERVER_OP_ACK          = -6,
   TABLESERVER_OP_ROLLBACK     =  7,
   TABLESERVER_OP_SERVER_UPDATE = 8,
+  TABLESERVER_OP_SERVER_READY = -9,
 };
 
 inline const char *get_mdstableserver_opname(int op) {
@@ -51,6 +52,7 @@ inline const char *get_mdstableserver_opname(int op) {
   case TABLESERVER_OP_ACK: return "ack";
   case TABLESERVER_OP_ROLLBACK: return "rollback";
   case TABLESERVER_OP_SERVER_UPDATE: return "server_update";
+  case TABLESERVER_OP_SERVER_READY: return "server_ready";
   default: assert(0); return 0;
   }
 };


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* Re: [PATCH 22/39] mds: handle linkage mismatch during cache rejoin
  2013-03-22  3:05     ` Yan, Zheng
@ 2013-03-25 16:14       ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-25 16:14 UTC (permalink / raw)
  To: Yan, Zheng, Sage Weil; +Cc: ceph-devel

On Thu, Mar 21, 2013 at 8:05 PM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> On 03/22/2013 05:23 AM, Gregory Farnum wrote:
>> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
>>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>>
>>> For MDS cluster, not all file system namespace operations that impact
>>> multiple MDS use two phase commit. Some operations use dentry link/unlink
>>> message to update replica dentry's linkage after they are committed by
>>> the master MDS. It's possible the master MDS crashes after journaling an
>>> operation, but before sending the dentry link/unlink messages. Later when
>>> the MDS recovers and receives cache rejoin messages from the surviving
>>> MDS, it will find linkage mismatch.
>>
>> I think you're here talking about link/unlink, and the MDS crashing
>> after it's sent out the LogEvent to the OSD but it hasn't actually
>> dispatched the observer slave requests. Is that right? This commit
>> message really confused me; I was trying to figure out which namespace
>> operations were hacking around a proper 2-phase commit by unlinking
>> and relinking inodes into the tree! (The link/unlink code also is
>> doing a 2-phase commit, it just doesn't force a particular order for
>> the journaling, which was previously left unhandled).
>
> I was talking about the cases that use MDCache::send_dentry_{link,unlink}
> to update replica dentry. There are a lot of usage in Server.cc.

Ah, we do this on mknod and openc as well as an explicit client link,
which makes sense. I think I was reading more into your statement than
was actually there.
-Greg

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 22/39] mds: handle linkage mismatch during cache rejoin
  2013-03-21 21:23   ` Gregory Farnum
  2013-03-22  3:05     ` Yan, Zheng
@ 2013-03-26  7:21     ` Yan, Zheng
  2013-03-29 22:09       ` Gregory Farnum
  1 sibling, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-03-26  7:21 UTC (permalink / raw)
  To: Gregory Farnum; +Cc: ceph-devel, Sage Weil

Updated patch below.

Thanks
Yan, Zheng
------
From c1d3576556f5ad2849d3079845dc26ef7612e8d3 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Thu, 14 Mar 2013 20:06:27 +0800
Subject: [PATCH 22/39] mds: handle linkage mismatch during cache rejoin

In an MDS cluster, not all file system namespace operations that involve
multiple MDSes use two-phase commit. Some operations use dentry link/unlink
messages to update a replica dentry's linkage after they are committed by
the master MDS. It's possible for the master MDS to crash after journaling
an operation, but before sending the dentry link/unlink messages. Later,
when the MDS recovers and receives cache rejoin messages from the surviving
MDSes, it will find linkage mismatches.

The original cache rejoin code does not properly handle the case where
dentry unlink messages were missed. Unlinked inodes were linked to stray
dentries, so the cache rejoin ack message needs to push replicas of these
stray dentries to the surviving MDS.

This patch also adds code that handles cache expiration in the middle of
cache rejoining.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc | 342 +++++++++++++++++++++++++++++++++++------------------
 src/mds/MDCache.h  |   1 +
 src/mds/mdstypes.h |   3 +
 3 files changed, 229 insertions(+), 117 deletions(-)
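
A rough sketch of the linkage check this patch adds to
handle_cache_rejoin_strong(), using hypothetical toy types rather than
the real CDentry/CInode interfaces: the recovering MDS compares the
survivor's claimed linkage against its own, and records any inode the
survivor still links as primary but we do not, so the rejoin ack can
later replicate that inode's stray dentry path.

#include <stdint.h>
#include <map>
#include <set>

// Hypothetical, simplified view of what one side believes a dentry links
// to (the real code compares CDentry::linkage_t with the dn_strong entry
// carried in the rejoin message).
enum ToyLinkage { LINK_NULL, LINK_REMOTE, LINK_PRIMARY };

struct ToyDentryState {
  ToyLinkage linkage;
  uint64_t ino;          // meaningful when linkage != LINK_NULL
};

// Compare the recovering MDS's view ("mine") with the surviving peer's
// claim ("theirs").  If the peer still claims a primary link we no longer
// hold here (it missed an MDentryUnlink, and possibly a later
// MDentryLink), remember the inode keyed by peer rank so rejoin_send_acks()
// can replicate its stray dentry path back to that peer.
void check_linkage(int peer,
                   const ToyDentryState &mine,
                   const ToyDentryState &theirs,
                   std::map<int, std::set<uint64_t> > &rejoin_unlinked)
{
  if (theirs.linkage == LINK_PRIMARY &&
      (mine.linkage != LINK_PRIMARY || mine.ino != theirs.ino))
    rejoin_unlinked[peer].insert(theirs.ino);
  // Other mismatches (we hold a primary the peer doesn't know about) mean
  // the peer missed an MDentryLink; nothing to record, the strong dentry
  // in the ack will correct the peer's view.
}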

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 8edb11c..0d2cac1 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -3540,7 +3540,6 @@ void MDCache::rejoin_send_rejoins()
     } else {
       // strong
       if (p->first == 0 && root) {
-	p->second->add_weak_inode(root->vino());
 	p->second->add_strong_inode(root->vino(),
 				    root->get_replica_nonce(),
 				    root->get_caps_wanted(),
@@ -3554,7 +3553,6 @@ void MDCache::rejoin_send_rejoins()
       }
 
       if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
-	p->second->add_weak_inode(in->vino());
 	p->second->add_strong_inode(in->vino(),
 				    in->get_replica_nonce(),
 				    in->get_caps_wanted(),
@@ -3571,6 +3569,8 @@ void MDCache::rejoin_send_rejoins()
     for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
 	 p != active_requests.end();
 	 ++p) {
+      if ( p->second->is_slave())
+	continue;
       // auth pins
       for (set<MDSCacheObject*>::iterator q = p->second->remote_auth_pins.begin();
 	   q != p->second->remote_auth_pins.end();
@@ -4230,6 +4230,8 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
     rejoin_potential_updated_scatterlocks.insert(in);
   }
 
+  rejoin_unlinked_inodes[from].clear();
+
   // surviving peer may send incorrect dirfrag here (maybe they didn't
   // get the fragment notify, or maybe we rolled back?).  we need to
   // infer the right frag and get them with the program.  somehow.
@@ -4336,106 +4338,118 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 
       dn->add_replica(from, q->second.nonce);
       dout(10) << " have " << *dn << dendl;
-      
-      // inode?
-      if (dnl->is_primary()) {
-	CInode *in = dnl->get_inode();
-	assert(in);
-
-	if (strong->strong_inodes.count(in->vino())) {
-	  MMDSCacheRejoin::inode_strong &is = strong->strong_inodes[in->vino()];
 
-	  // caps_wanted
-	  if (is.caps_wanted) {
-	    in->mds_caps_wanted[from] = is.caps_wanted;
-	    dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
-		     << " on " << *in << dendl;
-	  } 
-	  
-	  // scatterlocks?
-	  //  infer state from replica state:
-	  //   * go to MIX if they might have wrlocks
-	  //   * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
-	  in->filelock.infer_state_from_strong_rejoin(is.filelock, true);  // maybe also go to LOCK
-	  in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
-	  in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
-	  
-	  // auth pin?
-	  if (strong->authpinned_inodes.count(in->vino())) {
-	    MMDSCacheRejoin::slave_reqid r = strong->authpinned_inodes[in->vino()];
-	    dout(10) << " inode authpin by " << r << " on " << *in << dendl;
-	    
-	    // get/create slave mdrequest
-	    MDRequest *mdr;
-	    if (have_request(r.reqid))
-	      mdr = request_get(r.reqid);
-	    else
-	      mdr = request_start_slave(r.reqid, r.attempt, from);
-	    if (strong->frozen_authpin_inodes.count(in->vino())) {
-	      assert(!in->get_num_auth_pins());
-	      mdr->freeze_auth_pin(in);
-	    } else {
-	      assert(!in->is_frozen_auth_pin());
-	    }
-	    mdr->auth_pin(in);
-	  }
-	  // xlock(s)?
-	  if (strong->xlocked_inodes.count(in->vino())) {
-	    for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->xlocked_inodes[in->vino()].begin();
-		 r != strong->xlocked_inodes[in->vino()].end();
-		 ++r) {
-	      SimpleLock *lock = in->get_lock(r->first);
-	      dout(10) << " inode xlock by " << r->second << " on " << *lock << " on " << *in << dendl;
-	      MDRequest *mdr = request_get(r->second.reqid);  // should have this from auth_pin above.
-	      assert(mdr->is_auth_pinned(in));
-	      if (lock->is_stable())
-		in->auth_pin(lock);
-	      lock->set_state(LOCK_XLOCK);
-	      if (lock == &in->filelock)
-		in->loner_cap = -1;
-	      lock->get_xlock(mdr, mdr->get_client());
-	      mdr->xlocks.insert(lock);
-	      mdr->locks.insert(lock);
-	    }
-	  }
-	  // wrlock(s)?
-	  if (strong->wrlocked_inodes.count(in->vino())) {
-	    for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->wrlocked_inodes[in->vino()].begin();
-		 r != strong->wrlocked_inodes[in->vino()].end();
-		 ++r) {
-	      SimpleLock *lock = in->get_lock(r->first);
-	      dout(10) << " inode wrlock by " << r->second << " on " << *lock << " on " << *in << dendl;
-	      MDRequest *mdr = request_get(r->second.reqid);  // should have this from auth_pin above.
-	      assert(mdr->is_auth_pinned(in));
-	      lock->set_state(LOCK_LOCK);
-	      if (lock == &in->filelock)
-		in->loner_cap = -1;
-	      lock->get_wrlock(true);
-	      mdr->wrlocks.insert(lock);
-	      mdr->locks.insert(lock);
-	    }
+      if (dnl->is_primary()) {
+	if (q->second.is_primary()) {
+	  if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino()) {
+	    // the survivor missed MDentryUnlink+MDentryLink messages ?
+	    assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
+	    CInode *in = get_inode(q->second.ino, q->first.snapid);
+	    assert(in);
+	    assert(in->get_parent_dn());
+	    rejoin_unlinked_inodes[from].insert(in);
+	    dout(7) << " sender has primary dentry but wrong inode" << dendl;
 	  }
 	} else {
-	  dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl;
+	  // the survivor missed MDentryLink message ?
+	  assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
+	  dout(7) << " sender doesn't have primay dentry" << dendl;
+	}
+      } else {
+	if (q->second.is_primary()) {
+	  // the survivor missed MDentryUnlink message ?
+	  CInode *in = get_inode(q->second.ino, q->first.snapid);
+	  assert(in);
+	  assert(in->get_parent_dn());
+	  rejoin_unlinked_inodes[from].insert(in);
+	  dout(7) << " sender has primary dentry but we don't" << dendl;
 	}
-	
-	in->add_replica(from, p->second.nonce);
-	dout(10) << " have " << *in << dendl;
       }
     }
   }
 
-  // base inodes?  (root, stray, etc.)
-  for (set<vinodeno_t>::iterator p = strong->weak_inodes.begin();
-       p != strong->weak_inodes.end();
+  for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
+       p != strong->strong_inodes.end();
        ++p) {
-    CInode *in = get_inode(*p);
-    dout(10) << " have base " << *in << dendl;
-    in->add_replica(from);
+    CInode *in = get_inode(p->first);
+    assert(in);
+    in->add_replica(from, p->second.nonce);
+    dout(10) << " have " << *in << dendl;
+
+    MMDSCacheRejoin::inode_strong &is = p->second;
+
+    // caps_wanted
+    if (is.caps_wanted) {
+      in->mds_caps_wanted[from] = is.caps_wanted;
+      dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
+	       << " on " << *in << dendl;
+    }
+
+    // scatterlocks?
+    //  infer state from replica state:
+    //   * go to MIX if they might have wrlocks
+    //   * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
+    in->filelock.infer_state_from_strong_rejoin(is.filelock, true);  // maybe also go to LOCK
+    in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
+    in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
+
+    // auth pin?
+    if (strong->authpinned_inodes.count(in->vino())) {
+      MMDSCacheRejoin::slave_reqid r = strong->authpinned_inodes[in->vino()];
+      dout(10) << " inode authpin by " << r << " on " << *in << dendl;
+
+      // get/create slave mdrequest
+      MDRequest *mdr;
+      if (have_request(r.reqid))
+	mdr = request_get(r.reqid);
+      else
+	mdr = request_start_slave(r.reqid, r.attempt, from);
+      if (strong->frozen_authpin_inodes.count(in->vino())) {
+	assert(!in->get_num_auth_pins());
+	mdr->freeze_auth_pin(in);
+      } else {
+	assert(!in->is_frozen_auth_pin());
+      }
+      mdr->auth_pin(in);
+    }
+    // xlock(s)?
+    if (strong->xlocked_inodes.count(in->vino())) {
+      for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
+	   q != strong->xlocked_inodes[in->vino()].end();
+	   ++q) {
+	SimpleLock *lock = in->get_lock(q->first);
+	dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
+	MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
+	assert(mdr->is_auth_pinned(in));
+	if (lock->is_stable())
+	  in->auth_pin(lock);
+	lock->set_state(LOCK_XLOCK);
+	if (lock == &in->filelock)
+	  in->loner_cap = -1;
+	lock->get_xlock(mdr, mdr->get_client());
+	mdr->xlocks.insert(lock);
+	mdr->locks.insert(lock);
+      }
+    }
+    // wrlock(s)?
+    if (strong->wrlocked_inodes.count(in->vino())) {
+      for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->wrlocked_inodes[in->vino()].begin();
+	   q != strong->wrlocked_inodes[in->vino()].end();
+	   ++q) {
+	SimpleLock *lock = in->get_lock(q->first);
+	dout(10) << " inode wrlock by " << q->second << " on " << *lock << " on " << *in << dendl;
+	MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
+	assert(mdr->is_auth_pinned(in));
+	lock->set_state(LOCK_LOCK);
+	if (lock == &in->filelock)
+	  in->loner_cap = -1;
+	lock->get_wrlock(true);
+	mdr->wrlocks.insert(lock);
+	mdr->locks.insert(lock);
+      }
+    }
   }
 
-
-
   // done?
   assert(rejoin_gather.count(from));
   rejoin_gather.erase(from);
@@ -4452,6 +4466,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
   dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
   int from = ack->get_source().num();
 
+  // for sending cache expire message
+  list<CInode*> isolated_inodes;
+
   // dirs
   for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
        p != ack->strong_dirfrags.end();
@@ -4459,7 +4476,29 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
     // we may have had incorrect dir fragmentation; refragment based
     // on what they auth tells us.
     CDir *dir = get_force_dirfrag(p->first);
-    assert(dir);
+    if (!dir) {
+      CInode *diri = get_inode(p->first.ino);
+      if (!diri) {
+	// barebones inode; the full inode loop below will clean up.
+	diri = new CInode(this, false);
+	diri->inode.ino = p->first.ino;
+	diri->inode.mode = S_IFDIR;
+	if (MDS_INO_MDSDIR(p->first.ino)) {
+	  diri->inode_auth = pair<int,int>(from, CDIR_AUTH_UNKNOWN);
+	  add_inode(diri);
+	  dout(10) << " add inode " << *diri << dendl;
+	} else {
+	  diri->inode_auth = CDIR_AUTH_UNDEF;
+	  isolated_inodes.push_back(diri);
+	  dout(10) << " unconnected dirfrag " << p->first << dendl;
+	}
+      }
+      // barebones dirfrag; the full dirfrag loop below will clean up.
+      dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
+      if (dir->authority().first != from)
+	adjust_subtree_auth(dir, from);
+      dout(10) << " add dirfrag " << *dir << dendl;
+    }
 
     dir->set_replica_nonce(p->second.nonce);
     dir->state_clear(CDir::STATE_REJOINING);
@@ -4471,7 +4510,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
 	 q != dmap.end();
 	 ++q) {
       CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
-      assert(dn);
+      if(!dn)
+	dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
+
       CDentry::linkage_t *dnl = dn->get_linkage();
 
       assert(dn->last == q->first.snapid);
@@ -4480,33 +4521,48 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
 	dn->first = q->second.first;
       }
 
+      // may have bad linkage if we missed dentry link/unlink messages
+      if (dnl->is_primary()) {
+	CInode *in = dnl->get_inode();
+	if (!q->second.is_primary() ||
+	    vinodeno_t(q->second.ino, q->first.snapid) != in->vino()) {
+	  dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
+	  dir->unlink_inode(dn);
+	}
+      } else if (dnl->is_remote()) {
+	if (!q->second.is_remote() ||
+	    q->second.remote_ino != dnl->get_remote_ino() ||
+	    q->second.remote_d_type != dnl->get_remote_d_type()) {
+	  dout(10) << " had bad linkage for " << *dn <<  dendl;
+	  dir->unlink_inode(dn);
+	}
+      } else {
+	if (!q->second.is_null())
+	  dout(10) << " had bad linkage for " << *dn <<  dendl;
+      }
+
       // hmm, did we have the proper linkage here?
-      if (dnl->is_null() &&
-	  !q->second.is_null()) {
-	dout(10) << " had bad (missing) linkage for " << *dn << dendl;
+      if (dnl->is_null() && !q->second.is_null()) {
 	if (q->second.is_remote()) {
 	  dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
 	} else {
 	  CInode *in = get_inode(q->second.ino, q->first.snapid);
-	  assert(in == 0);  // a rename would have been caught be the resolve stage.
-	  // barebones inode; the full inode loop below will clean up.
-	  in = new CInode(this, false, q->second.first, q->first.snapid);
-	  in->inode.ino = q->second.ino;
-	  add_inode(in);
+	  if (!in) {
+	    // barebones inode; assume it's dir, the full inode loop below will clean up.
+	    in = new CInode(this, false, q->second.first, q->first.snapid);
+	    in->inode.ino = q->second.ino;
+	    in->inode.mode = S_IFDIR;
+	    add_inode(in);
+	    dout(10) << " add inode " << *in << dendl;
+	  } else if (in->get_parent_dn()) {
+	    dout(10) << " had bad linkage for " << *(in->get_parent_dn())
+		     << ", unlinking " << *in << dendl;
+	    in->get_parent_dir()->unlink_inode(in->get_parent_dn());
+	  }
 	  dn->dir->link_primary_inode(dn, in);
 	}
       }
-      else if (!dnl->is_null() &&
-	       q->second.is_null()) {
-	dout(0) << " had bad linkage for " << *dn << dendl;
-	/* 
-	 * this should happen:
-	 *  if we're a survivor, any unlink should commit or rollback during
-	 * the resolve stage.
-	 *  if we failed, we shouldn't have non-auth leaf dentries at all
-	 */
-	assert(0);  // uh oh.	
-      }
+
       dn->set_replica_nonce(q->second.nonce);
       dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters);
       dn->state_clear(CDentry::STATE_REJOINING);
@@ -4565,6 +4621,21 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
     dout(10) << " got inode locks " << *in << dendl;
   }
 
+  // trim unconnected subtree
+  if (!isolated_inodes.empty()) {
+    map<int, MCacheExpire*> expiremap;
+    for (list<CInode*>::iterator p = isolated_inodes.begin();
+	 p != isolated_inodes.end();
+	 ++p) {
+      list<CDir*> ls;
+      (*p)->get_dirfrags(ls);
+      trim_dirfrag(*ls.begin(), 0, expiremap);
+      assert((*p)->get_num_ref() == 0);
+      delete *p;
+    }
+    send_expire_messages(expiremap);
+  }
+
   // done?
   assert(rejoin_ack_gather.count(from));
   rejoin_ack_gather.erase(from);
@@ -5165,6 +5236,37 @@ void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snap
 void MDCache::rejoin_send_acks()
 {
   dout(7) << "rejoin_send_acks" << dendl;
+
+  // replicate stray
+  for (map<int, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
+       p != rejoin_unlinked_inodes.end();
+       ++p) {
+    for (set<CInode*>::iterator q = p->second.begin();
+	 q != p->second.end();
+	 ++q) {
+      CInode *in = *q;
+      dout(7) << " unlinked inode " << *in << dendl;
+      // inode expired
+      if (!in->is_replica(p->first))
+	continue;
+      while (1) {
+	CDentry *dn = in->get_parent_dn();
+	if (dn->is_replica(p->first))
+	  break;
+	dn->add_replica(p->first);
+	CDir *dir = dn->get_dir();
+	if (dir->is_replica(p->first))
+	  break;
+	dir->add_replica(p->first);
+	in = dir->get_inode();
+	if (in->is_replica(p->first))
+	  break;
+	if (in->is_base())
+	  break;
+      }
+    }
+  }
+  rejoin_unlinked_inodes.clear();
   
   // send acks to everyone in the recovery set
   map<int,MMDSCacheRejoin*> ack;
@@ -5204,23 +5306,29 @@ void MDCache::rejoin_send_acks()
 	CDentry *dn = q->second;
 	CDentry::linkage_t *dnl = dn->get_linkage();
 
+	// inode
+	CInode *in = NULL;
+	if (dnl->is_primary())
+	  in = dnl->get_inode();
+
 	// dentry
 	for (map<int,int>::iterator r = dn->replicas_begin();
 	     r != dn->replicas_end();
-	     ++r) 
+	     ++r) {
 	  ack[r->first]->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
 					   dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
 					   dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
 					   dnl->is_remote() ? dnl->get_remote_d_type():0,
 					   ++r->second,
 					   dn->lock.get_replica_state());
+	  // peer missed MDentrylink message ?
+	  if (in && !in->is_replica(r->first))
+	    in->add_replica(r->first);
+	}
 	
-	if (!dnl->is_primary())
+	if (!in)
 	  continue;
 
-	// inode
-	CInode *in = dnl->get_inode();
-
 	for (map<int,int>::iterator r = in->replicas_begin();
 	     r != in->replicas_end();
 	     ++r) {
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 2a65d0a..73780e2 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -416,6 +416,7 @@ protected:
   set<CInode*> rejoin_undef_inodes;
   set<CInode*> rejoin_potential_updated_scatterlocks;
   set<CDir*>   rejoin_undef_dirfrags;
+  map<int, set<CInode*> > rejoin_unlinked_inodes;
 
   vector<CInode*> rejoin_recover_q, rejoin_check_q;
   list<Context*> rejoin_waiters;
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
index cf5c1a7..aa9d165 100644
--- a/src/mds/mdstypes.h
+++ b/src/mds/mdstypes.h
@@ -235,6 +235,9 @@ WRITE_CLASS_ENCODER(vinodeno_t)
 inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) {
   return l.ino == r.ino && l.snapid == r.snapid;
 }
+inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) {
+  return !(l == r);
+}
 inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) {
   return 
     l.ino < r.ino ||
-- 
1.7.11.7
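
A companion sketch of the "replicate stray" walk that rejoin_send_acks()
performs for each inode recorded by the check above; the node types are
again hypothetical stand-ins for CInode/CDentry/CDir, not the real
classes. The ack has to carry replicas of the whole stray dentry path,
so we walk upward from the unlinked inode until the peer already
replicates an ancestor or we reach a base inode.

#include <cstddef>
#include <set>

struct ToyDir;

// Hypothetical minimal cache objects: an inode hangs off a parent dentry,
// a dentry lives in a dir, and a dir belongs to an inode.
struct ToyInode {
  struct ToyDentry *parent_dn;   // NULL for base inodes (root, mdsdir, ...)
  std::set<int> replicas;
  bool is_base() const { return parent_dn == NULL; }
};
struct ToyDentry {
  ToyDir *dir;
  std::set<int> replicas;
};
struct ToyDir {
  ToyInode *inode;
  std::set<int> replicas;
};

// Mirror of the loop added to MDCache::rejoin_send_acks(): make sure
// 'peer' replicates the dentries and dirfrags above 'in'.
void replicate_stray_path(ToyInode *in, int peer)
{
  if (!in->replicas.count(peer))
    return;                                 // inode already expired on peer
  while (true) {
    ToyDentry *dn = in->parent_dn;
    if (dn->replicas.count(peer)) break;    // peer already has the dentry
    dn->replicas.insert(peer);
    ToyDir *dir = dn->dir;
    if (dir->replicas.count(peer)) break;   // ... or the dirfrag
    dir->replicas.insert(peer);
    in = dir->inode;
    if (in->replicas.count(peer)) break;    // ... or the ancestor inode
    if (in->is_base()) break;               // reached root/mdsdir; stop
    // otherwise keep walking up; the main ack loop replicates the inode
    // itself once its dentry is marked as replicated.
  }
}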


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* Re: [PATCH 05/39] mds: send table request when peer is in proper state.
  2013-03-17 14:51 ` [PATCH 05/39] mds: send table request when peer is in proper state Yan, Zheng
  2013-03-20 18:34   ` Greg Farnum
@ 2013-03-29 21:58   ` Gregory Farnum
  1 sibling, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-29 21:58 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> Table client/server should send request/reply when the peer is active.
> Anchor query is an exception, because MDS in rejoin stage may need
> fetch files before sending rejoin ack, the anchor server can also be
> in rejoin stage.

Since this patch doesn't touch the TableServer, can you remove those
references from the commit message? The rest looks good.
Reviewed-by: Greg Farnum <greg@inktank.com>

>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/AnchorClient.cc   | 5 ++++-
>  src/mds/MDSTableClient.cc | 9 ++++++---
>  src/mds/MDSTableServer.cc | 3 ++-
>  3 files changed, 12 insertions(+), 5 deletions(-)
>
> diff --git a/src/mds/AnchorClient.cc b/src/mds/AnchorClient.cc
> index 455e97f..d7da9d1 100644
> --- a/src/mds/AnchorClient.cc
> +++ b/src/mds/AnchorClient.cc
> @@ -80,9 +80,12 @@ void AnchorClient::lookup(inodeno_t ino, vector<Anchor>& trace, Context *onfinis
>
>  void AnchorClient::_lookup(inodeno_t ino)
>  {
> +  int ts = mds->mdsmap->get_tableserver();
> +  if (mds->mdsmap->get_state(ts) < MDSMap::STATE_REJOIN)
> +    return;
>    MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_QUERY, 0, 0);
>    ::encode(ino, req->bl);
> -  mds->send_message_mds(req, mds->mdsmap->get_tableserver());
> +  mds->send_message_mds(req, ts);
>  }
>
>
> diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc
> index beba0a3..df0131f 100644
> --- a/src/mds/MDSTableClient.cc
> +++ b/src/mds/MDSTableClient.cc
> @@ -149,9 +149,10 @@ void MDSTableClient::_prepare(bufferlist& mutation, version_t *ptid, bufferlist
>  void MDSTableClient::send_to_tableserver(MMDSTableRequest *req)
>  {
>    int ts = mds->mdsmap->get_tableserver();
> -  if (mds->mdsmap->get_state(ts) >= MDSMap::STATE_CLIENTREPLAY)
> +  if (mds->mdsmap->get_state(ts) >= MDSMap::STATE_CLIENTREPLAY) {
>      mds->send_message_mds(req, ts);
> -  else {
> +  } else {
> +    req->put();
>      dout(10) << " deferring request to not-yet-active tableserver mds." << ts << dendl;
>    }
>  }
> @@ -193,7 +194,9 @@ void MDSTableClient::got_journaled_ack(version_t tid)
>  void MDSTableClient::finish_recovery()
>  {
>    dout(7) << "finish_recovery" << dendl;
> -  resend_commits();
> +  int ts = mds->mdsmap->get_tableserver();
> +  if (mds->mdsmap->get_state(ts) >= MDSMap::STATE_CLIENTREPLAY)
> +    resend_commits();
>  }
>
>  void MDSTableClient::resend_commits()
> diff --git a/src/mds/MDSTableServer.cc b/src/mds/MDSTableServer.cc
> index 4f86ff1..07c7d26 100644
> --- a/src/mds/MDSTableServer.cc
> +++ b/src/mds/MDSTableServer.cc
> @@ -159,7 +159,8 @@ void MDSTableServer::handle_mds_recovery(int who)
>    for (map<version_t,mds_table_pending_t>::iterator p = pending_for_mds.begin();
>         p != pending_for_mds.end();
>         ++p) {
> -    if (who >= 0 && p->second.mds != who)
> +    if ((who >= 0 && p->second.mds != who) ||
> +       mds->mdsmap->get_state(p->second.mds) < MDSMap::STATE_CLIENTREPLAY)
>        continue;
>      MMDSTableRequest *reply = new MMDSTableRequest(table, TABLESERVER_OP_AGREE, p->second.reqid, p->second.tid);
>      mds->send_message_mds(reply, p->second.mds);
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 06/39] mds: make table client/server tolerate duplicated message
  2013-03-17 14:51 ` [PATCH 06/39] mds: make table client/server tolerate duplicated message Yan, Zheng
@ 2013-03-29 22:00   ` Gregory Farnum
  2013-03-31 13:21     ` Yan, Zheng
  0 siblings, 1 reply; 117+ messages in thread
From: Gregory Farnum @ 2013-03-29 22:00 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

I believe this patch has been made obsolete by the tid exchange
you're doing now, right?
-Greg

On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> Anchor client re-sends queries when the anchor server becomes active.
> So it's possible to get duplicated query reply.
>
> When the table server recovers, the clients re-send commits to the
> server, the server re-sends 'agree' messages to the clients. When
> the clients receive the 'agree' messages, they may send another
> commit/rollback message to the server.
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/AnchorClient.cc   |  4 +++-
>  src/mds/AnchorServer.cc   |  6 ++++--
>  src/mds/MDSTableServer.cc | 22 ++++++++++++++++------
>  3 files changed, 23 insertions(+), 9 deletions(-)
>
> diff --git a/src/mds/AnchorClient.cc b/src/mds/AnchorClient.cc
> index d7da9d1..bcc8710 100644
> --- a/src/mds/AnchorClient.cc
> +++ b/src/mds/AnchorClient.cc
> @@ -41,7 +41,9 @@ void AnchorClient::handle_query_result(class MMDSTableRequest *m)
>    ::decode(ino, p);
>    ::decode(trace, p);
>
> -  assert(pending_lookup.count(ino));
> +  if (!pending_lookup.count(ino))
> +    return;
> +
>    list<_pending_lookup> ls;
>    ls.swap(pending_lookup[ino]);
>    pending_lookup.erase(ino);
> diff --git a/src/mds/AnchorServer.cc b/src/mds/AnchorServer.cc
> index 6f37e53..594bf7b 100644
> --- a/src/mds/AnchorServer.cc
> +++ b/src/mds/AnchorServer.cc
> @@ -213,10 +213,12 @@ bool AnchorServer::check_pending(version_t tid, MMDSTableRequest *req, list<Cont
>        ++p;
>      }
>      assert(p != pending.end());
> -    assert(p->second == NULL);
>      // not the earliest pending operation, wait if it's a commit
>      if (req) {
> -      p->second = new C_MDS_RetryMessage(mds, req);
> +      if (p->second == NULL)
> +       p->second = new C_MDS_RetryMessage(mds, req);
> +      else
> +       req->put(); // duplicated commit
>        return false;
>      }
>    }
> diff --git a/src/mds/MDSTableServer.cc b/src/mds/MDSTableServer.cc
> index 07c7d26..730606f 100644
> --- a/src/mds/MDSTableServer.cc
> +++ b/src/mds/MDSTableServer.cc
> @@ -120,15 +120,25 @@ void MDSTableServer::_commit_logged(MMDSTableRequest *req)
>  void MDSTableServer::handle_rollback(MMDSTableRequest *req)
>  {
>    dout(7) << "handle_rollback " << *req << dendl;
> -  _rollback(req->get_tid());
> -  _note_rollback(req->get_tid());
> -  mds->mdlog->start_submit_entry(new ETableServer(table, TABLESERVER_OP_ROLLBACK, 0, -1,
> -                                                 req->get_tid(), version));
> +
> +  version_t tid = req->get_tid();
> +  if (pending_for_mds.count(tid)) {
> +    _rollback(tid);
> +    _note_rollback(tid);
> +    mds->mdlog->start_submit_entry(new ETableServer(table, TABLESERVER_OP_ROLLBACK, 0, -1,
> +         tid, version));
> +  } else if (tid <= version) {
> +    dout(0) << "got rollback for tid " << tid << " <= " << version
> +           << ", already rollbacked or committed." << dendl;
> +  }
> +  else {
> +    // wtf.
> +    dout(0) << "got rollbacked for tid " << tid << " > " << version << dendl;
> +    assert(tid <= version);
> +  }
>    req->put();
>  }
>
> -
> -
>  // SERVER UPDATE
>
>  void MDSTableServer::do_server_update(bufferlist& bl)
> --
> 1.7.11.7
>

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 18/39] mds: fix MDS recovery involving cross authority rename
  2013-03-22  3:04     ` Yan, Zheng
@ 2013-03-29 22:02       ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-29 22:02 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Sage Weil, ceph-devel

Yep, this all looks good in your tree now.
Reviewed-by: Greg Farnum <greg@inktank.com>

On Thu, Mar 21, 2013 at 8:04 PM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> On 03/22/2013 01:59 AM, Gregory Farnum wrote:
>> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
>>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>>
>>> For mds cluster, rename operation may involve multiple MDS. If the
>>> rename source's auth MDS crashes after some witness MDS have prepared
>>> the rename but before the rename is committing. Later when the MDS
>>> recovers, its subtree map and linkages are different from the prepared
>>> MDS'. This causes problems for both subtree resolve and cache rejoin.
>>> The solution is, if the rename source's auth MDS fails, the prepared
>>> witness MDS query the master MDS if the operation is committing. If
>>> it's not, rollback the rename, then send resolve message to the
>>> recovering MDS.
>>>
>>> Another similar case is a prepared witness MDS crashes when the
>>> rename source's auth MDS has prepared or is preparing the operation.
>>> when the witness recovers, the master just delay sending the resolve
>>> ack message until the it commits the operation.
>>>
>>> This patch also updates Server::handle_client_rename(). Make preparing
>>> the rename source's auth MDS be the final step before committing the
>>> rename.
>>
>> Why? It's not immediately obvious to me what the benefit is, and the
>> commit message should state it. :)
>
> For the second case, it's possible the recovering MDS is anchor server. The master delays
> sending the resolve ack message until pending update is committed. To commit the pending
> update, the master needs anchor server's preparation ack. The master and the anchor server
> wait for each other.
>
>>
>>>
>>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>>> ---
>>>  src/mds/MDCache.cc |  75 +++++++++++++++++++++++++++++-----------
>>>  src/mds/MDCache.h  |  17 +++++++--
>>>  src/mds/Mutation.h |   2 ++
>>>  src/mds/Server.cc  | 100 ++++++++++++++++++++++++++++-------------------------
>>>  4 files changed, 124 insertions(+), 70 deletions(-)
>>>
>>> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>>> index 9b37b1e..d934020 100644
>>> --- a/src/mds/MDCache.cc
>>> +++ b/src/mds/MDCache.cc
>>> @@ -2491,7 +2491,7 @@ void MDCache::send_slave_resolves()
>>>        if (!p->second->is_slave() || !p->second->slave_did_prepare())
>>>         continue;
>>>        int master = p->second->slave_to_mds;
>>> -      if (resolve_set.count(master)) {
>>> +      if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
>>>         dout(10) << " including uncommitted " << *p->second << dendl;
>>>         if (!resolves.count(master))
>>>           resolves[master] = new MMDSResolve;
>>> @@ -2610,6 +2610,7 @@ void MDCache::handle_mds_failure(int who)
>>>
>>>    resolve_gather.insert(who);
>>>    discard_delayed_resolve(who);
>>> +  ambiguous_slave_updates.erase(who);
>>>
>>>    rejoin_gather.insert(who);
>>>    rejoin_sent.erase(who);        // i need to send another
>>> @@ -2642,14 +2643,46 @@ void MDCache::handle_mds_failure(int who)
>>>           finish.push_back(p->second);
>>>        }
>>>      }
>>> +
>>> +    if (p->second->is_slave() &&
>>> +       p->second->slave_did_prepare() && p->second->more()->srcdn_auth_mds == who &&
>>> +       mds->mdsmap->is_clientreplay_or_active_or_stopping(p->second->slave_to_mds)) {
>>> +      // rename srcdn's auth mds failed, resolve even I'm a survivor.
>>> +      dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl;
>>> +      add_ambiguous_slave_update(p->first, p->second->slave_to_mds);
>>> +    }
>>>
>>>      // failed node is slave?
>>>      if (p->second->is_master() && !p->second->committing) {
>>> +      if (p->second->more()->srcdn_auth_mds == who) {
>>> +       dout(10) << " master request " << *p->second << " waiting for rename srcdn's auth mds."
>>> +                << who << " to recover" << dendl;
>>> +       assert(p->second->more()->witnessed.count(who) == 0);
>>> +       if (p->second->more()->is_ambiguous_auth)
>>> +         p->second->clear_ambiguous_auth();
>>> +       // rename srcdn's auth mds failed, all witnesses will rollback
>>> +       p->second->more()->witnessed.clear();
>>> +       pending_masters.erase(p->first);
>>> +      }
>>> +
>>>        if (p->second->more()->witnessed.count(who)) {
>>> -       dout(10) << " master request " << *p->second << " no longer witnessed by slave mds." << who
>>> -                << dendl;
>>> -       // discard this peer's prepare (if any)
>>> -       p->second->more()->witnessed.erase(who);
>>> +       int srcdn_auth = p->second->more()->srcdn_auth_mds;
>>> +       if (srcdn_auth >= 0 && p->second->more()->waiting_on_slave.count(srcdn_auth)) {
>>> +         dout(10) << " master request " << *p->second << " waiting for rename srcdn's auth mds."
>>> +                  << p->second->more()->srcdn_auth_mds << " to reply" << dendl;
>>> +         // waiting for the last slave (rename srcdn's auth mds), delay sending resolve ack
>>> +         // until either the request is committing or the last slave also fails.
>>> +         assert(p->second->more()->waiting_on_slave.size() == 1);
>>> +         pending_masters.insert(p->first);
>>
>> The language about "last slave" is confusing me here — I'm with you
>> that this rename should only have one slave, but I don't think it ever
>> should have had more than one. Do you mean "only slave" or am I
>> missing something?
>
> Yes, I mean the 'only slave'. But the code 'more()->waiting_on_slave' also considers witnesses
> also as slave, that's why I use 'last slave'. Will update the comment.
>
>>
>>> +       } else {
>>> +         dout(10) << " master request " << *p->second << " no longer witnessed by slave mds."
>>> +                  << who << " to recover" << dendl;
>>> +         if (srcdn_auth >= 0)
>>> +           assert(p->second->more()->witnessed.count(srcdn_auth) == 0);
>>> +
>>> +         // discard this peer's prepare (if any)
>>> +         p->second->more()->witnessed.erase(who);
>>> +       }
>>>        }
>>>
>>>        if (p->second->more()->waiting_on_slave.count(who)) {
>>> @@ -2657,14 +2690,8 @@ void MDCache::handle_mds_failure(int who)
>>>                  << " to recover" << dendl;
>>>         // retry request when peer recovers
>>>         p->second->more()->waiting_on_slave.erase(who);
>>> -       mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second));
>>> -      }
>>> -
>>> -      if (p->second->has_more() && p->second->more()->is_ambiguous_auth &&
>>> -         p->second->more()->rename_inode->authority().first == who) {
>>> -       dout(10) << " master request " << *p->second << " waiting for renamed inode's auth mds." << who
>>> -                << " to recover" << dendl;
>>> -       p->second->clear_ambiguous_auth();
>>
>> Why are you getting rid of waiting for the renamed inode's MDS? I
>> could be misremembering, but I believe we need it, and it might be
>> different from the source or dest dentry auths.
>
> The code is moved up. see above test "(p->second->more()->srcdn_auth_mds == who)"
>
>>
>>> +       if (p->second->more()->waiting_on_slave.empty())
>>> +         mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second));
>>>        }
>>>
>>>        if (p->second->locking && p->second->locking_target_mds == who)
>>> @@ -2951,16 +2978,27 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
>>>    dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
>>>    int from = ack->get_source().num();
>>>
>>> -  if (!resolve_ack_gather.count(from)) {
>>> +  if (!resolve_ack_gather.count(from) ||
>>> +      mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
>>>      ack->put();
>>>      return;
>>>    }
>>>
>>> +  if (ambiguous_slave_updates.count(from)) {
>>> +    assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
>>> +    assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
>>> +  }
>>> +
>>>    for (vector<metareqid_t>::iterator p = ack->commit.begin();
>>>         p != ack->commit.end();
>>>         ++p) {
>>>      dout(10) << " commit on slave " << *p << dendl;
>>>
>>> +    if (ambiguous_slave_updates.count(from)) {
>>> +      remove_ambiguous_slave_update(*p, from);
>>> +      continue;
>>> +    }
>>> +
>>>      if (mds->is_resolve()) {
>>>        // replay
>>>        MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
>>> @@ -3020,13 +3058,8 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
>>>      }
>>>    }
>>>
>>> -  if (!mds->is_resolve()) {
>>> -    for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
>>> -       p != active_requests.end(); ++p)
>>> -      assert(p->second->slave_to_mds != from);
>>> -  }
>>> -
>>> -  resolve_ack_gather.erase(from);
>>> +  if (!ambiguous_slave_updates.count(from))
>>> +    resolve_ack_gather.erase(from);
>>>    if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
>>>      send_subtree_resolves();
>>>      process_delayed_resolve();
>>> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
>>> index 8f262b9..a05ced7 100644
>>> --- a/src/mds/MDCache.h
>>> +++ b/src/mds/MDCache.h
>>> @@ -327,9 +327,8 @@ protected:
>>>    map<metareqid_t, umaster>                 uncommitted_masters;         // master: req -> slave set
>>>
>>>    set<metareqid_t>             pending_masters;
>>> +  map<int, set<metareqid_t> >  ambiguous_slave_updates;
>>>
>>> -  //map<metareqid_t, bool>     ambiguous_slave_updates;         // for log trimming.
>>> -  //map<metareqid_t, Context*> waiting_for_slave_update_commit;
>>>    friend class ESlaveUpdate;
>>>    friend class ECommitted;
>>>
>>> @@ -353,6 +352,20 @@ protected:
>>>  public:
>>>    void remove_inode_recursive(CInode *in);
>>>
>>> +  bool is_ambiguous_slave_update(metareqid_t reqid, int master) {
>>> +    return ambiguous_slave_updates.count(master) &&
>>> +          ambiguous_slave_updates[master].count(reqid);
>>> +  }
>>> +  void add_ambiguous_slave_update(metareqid_t reqid, int master) {
>>> +    ambiguous_slave_updates[master].insert(reqid);
>>> +  }
>>> +  void remove_ambiguous_slave_update(metareqid_t reqid, int master) {
>>> +    assert(ambiguous_slave_updates[master].count(reqid));
>>> +    ambiguous_slave_updates[master].erase(reqid);
>>> +    if (ambiguous_slave_updates[master].empty())
>>> +      ambiguous_slave_updates.erase(master);
>>> +  }
>>> +
>>>    void add_rollback(metareqid_t reqid, int master) {
>>>      need_resolve_rollback[reqid] = master;
>>>    }
>>> diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h
>>> index 5013f04..de122a5 100644
>>> --- a/src/mds/Mutation.h
>>> +++ b/src/mds/Mutation.h
>>> @@ -207,6 +207,7 @@ struct MDRequest : public Mutation {
>>>
>>>      // for rename
>>>      set<int> extra_witnesses; // replica list from srcdn auth (rename)
>>> +    int srcdn_auth_mds;
>>>      version_t src_reanchor_atid;  // src->dst
>>>      version_t dst_reanchor_atid;  // dst->stray
>>>      bufferlist inode_import;
>>> @@ -233,6 +234,7 @@ struct MDRequest : public Mutation {
>>>      bufferlist rollback_bl;
>>>
>>>      More() :
>>> +      srcdn_auth_mds(-1),
>>>        src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0),
>>>        rename_inode(0), is_freeze_authpin(false), is_ambiguous_auth(false),
>>>        is_remote_frozen_authpin(false), is_inode_exporter(false),
>>> diff --git a/src/mds/Server.cc b/src/mds/Server.cc
>>> index 1330f11..b6e5665 100644
>>> --- a/src/mds/Server.cc
>>> +++ b/src/mds/Server.cc
>>> @@ -5772,12 +5772,52 @@ void Server::handle_client_rename(MDRequest *mdr)
>>>    if (mdr->now == utime_t())
>>>      mdr->now = ceph_clock_now(g_ceph_context);
>>>
>>> +  // -- prepare anchor updates --
>>> +  if (!linkmerge || srcdnl->is_primary()) {
>>> +    C_GatherBuilder anchorgather(g_ceph_context);
>>> +
>>> +    if (srcdnl->is_primary() &&
>>> +      (srcdnl->get_inode()->is_anchored() ||
>>> +       (srcdnl->get_inode()->is_dir() && (srcdnl->get_inode()->inode.rstat.ranchors ||
>>> +                                          srcdnl->get_inode()->nested_anchors ||
>>> +                                          !mdcache->is_leaf_subtree(mdcache->get_projected_subtree_root(srcdn->get_dir()))))) &&
>>> +      !mdr->more()->src_reanchor_atid) {
>>> +      dout(10) << "reanchoring src->dst " << *srcdnl->get_inode() << dendl;
>>> +      vector<Anchor> trace;
>>> +      destdn->make_anchor_trace(trace, srcdnl->get_inode());
>>> +      mds->anchorclient->prepare_update(srcdnl->get_inode()->ino(),
>>> +                                       trace, &mdr->more()->src_reanchor_atid,
>>> +                                       anchorgather.new_sub());
>>> +    }
>>> +    if (destdnl->is_primary() &&
>>> +       destdnl->get_inode()->is_anchored() &&
>>> +       !mdr->more()->dst_reanchor_atid) {
>>> +      dout(10) << "reanchoring dst->stray " << *destdnl->get_inode() << dendl;
>>> +
>>> +      assert(straydn);
>>> +      vector<Anchor> trace;
>>> +      straydn->make_anchor_trace(trace, destdnl->get_inode());
>>> +
>>> +      mds->anchorclient->prepare_update(destdnl->get_inode()->ino(), trace,
>>> +                 &mdr->more()->dst_reanchor_atid, anchorgather.new_sub());
>>> +    }
>>> +
>>> +    if (anchorgather.has_subs())  {
>>> +      anchorgather.set_finisher(new C_MDS_RetryRequest(mdcache, mdr));
>>> +      anchorgather.activate();
>>> +      return;  // waiting for anchor prepares
>>> +    }
>>> +
>>> +    assert(g_conf->mds_kill_rename_at != 2);
>>> +  }
>>> +
>>>    // -- prepare witnesses --
>>>
>>>    // do srcdn auth last
>>>    int last = -1;
>>>    if (!srcdn->is_auth()) {
>>>      last = srcdn->authority().first;
>>> +    mdr->more()->srcdn_auth_mds = last;
>>>      // ask auth of srci to mark srci as ambiguous auth if more than two MDS
>>>      // are involved in the rename operation.
>>>      if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
>>> @@ -5803,58 +5843,18 @@ void Server::handle_client_rename(MDRequest *mdr)
>>>    if (!mdr->more()->waiting_on_slave.empty())
>>>      return;  // we're waiting for a witness.
>>>
>>> -  if (last >= 0 &&
>>> -      mdr->more()->witnessed.count(last) == 0 &&
>>> -      mdr->more()->waiting_on_slave.count(last) == 0) {
>>> +  if (last >= 0 && mdr->more()->witnessed.count(last) == 0) {
>>>      dout(10) << " preparing last witness (srcdn auth)" << dendl;
>>> +    assert(mdr->more()->waiting_on_slave.count(last) == 0);
>>>      _rename_prepare_witness(mdr, last, witnesses, srcdn, destdn, straydn);
>>>      return;
>>>    }
>>>
>>>    // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
>>>    if (!mdr->more()->slaves.empty() && !srci->is_dir())
>>> -    assert(g_conf->mds_kill_rename_at != 2);
>>> +    assert(g_conf->mds_kill_rename_at != 3);
>>>    if (!mdr->more()->slaves.empty() && srci->is_dir())
>>> -    assert(g_conf->mds_kill_rename_at != 3);
>>> -
>>> -  // -- prepare anchor updates --
>>> -  if (!linkmerge || srcdnl->is_primary()) {
>>> -    C_GatherBuilder anchorgather(g_ceph_context);
>>> -
>>> -    if (srcdnl->is_primary() &&
>>> -       (srcdnl->get_inode()->is_anchored() ||
>>> -        (srcdnl->get_inode()->is_dir() && (srcdnl->get_inode()->inode.rstat.ranchors ||
>>> -                                           srcdnl->get_inode()->nested_anchors ||
>>> -                                           !mdcache->is_leaf_subtree(mdcache->get_projected_subtree_root(srcdn->get_dir()))))) &&
>>> -       !mdr->more()->src_reanchor_atid) {
>>> -      dout(10) << "reanchoring src->dst " << *srcdnl->get_inode() << dendl;
>>> -      vector<Anchor> trace;
>>> -      destdn->make_anchor_trace(trace, srcdnl->get_inode());
>>> -      mds->anchorclient->prepare_update(srcdnl->get_inode()->ino(),
>>> -                                       trace, &mdr->more()->src_reanchor_atid,
>>> -                                       anchorgather.new_sub());
>>> -    }
>>> -    if (destdnl->is_primary() &&
>>> -       destdnl->get_inode()->is_anchored() &&
>>> -       !mdr->more()->dst_reanchor_atid) {
>>> -      dout(10) << "reanchoring dst->stray " << *destdnl->get_inode() << dendl;
>>> -
>>> -      assert(straydn);
>>> -      vector<Anchor> trace;
>>> -      straydn->make_anchor_trace(trace, destdnl->get_inode());
>>> -
>>> -      mds->anchorclient->prepare_update(destdnl->get_inode()->ino(), trace,
>>> -                 &mdr->more()->dst_reanchor_atid, anchorgather.new_sub());
>>> -    }
>>> -
>>> -    if (anchorgather.has_subs())  {
>>> -      anchorgather.set_finisher(new C_MDS_RetryRequest(mdcache, mdr));
>>> -      anchorgather.activate();
>>> -      return;  // waiting for anchor prepares
>>> -    }
>>> -
>>>      assert(g_conf->mds_kill_rename_at != 4);
>>> -  }
>>>
>>>    // -- prepare journal entry --
>>>    mdr->ls = mdlog->get_current_segment();
>>> @@ -6762,10 +6762,17 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r,
>>>      // abort
>>>      //  rollback_bl may be empty if we froze the inode but had to provide an expanded
>>>      // witness list from the master, and they failed before we tried prep again.
>>> -    if (mdr->more()->rollback_bl.length())
>>> -      do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
>>> -    else
>>> +    if (mdr->more()->rollback_bl.length()) {
>>> +      if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
>>> +       mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
>>> +       // rollback but preserve the slave request
>>> +       do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, NULL);
>>> +      } else
>>> +       do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
>>> +    } else {
>>>        dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
>>> +      mds->mdcache->request_finish(mdr);
>>> +    }
>>>    }
>>>  }
>>>
>>> @@ -6825,7 +6832,6 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr)
>>>    dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
>>>    // need to finish this update before sending resolve to claim the subtree
>>>    mds->mdcache->add_rollback(rollback.reqid, master);
>>> -  assert(mdr || mds->is_resolve());
>>>
>>>    Mutation *mut = new Mutation(rollback.reqid);
>>>    mut->ls = mds->mdlog->get_current_segment();
>>> --
>>> 1.7.11.7
>>>
>
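A minimal, self-contained sketch of the abort path in _commit_slave_rename quoted above. It is illustrative only: MDRequest, bufferlist and MDCache are reduced to a length and a flag, where the real code consults mdcache->is_ambiguous_slave_update() and calls do_rename_rollback() / request_finish().

#include <cstddef>
#include <iostream>

enum class Action { RollbackKeepRequest, RollbackDropRequest, FinishRequest };

// rollback_len stands in for mdr->more()->rollback_bl.length();
// is_ambiguous stands in for mdcache->is_ambiguous_slave_update(reqid, master).
Action on_slave_rename_abort(std::size_t rollback_len, bool is_ambiguous) {
  if (rollback_len > 0) {
    if (is_ambiguous)
      return Action::RollbackKeepRequest;  // do_rename_rollback(..., NULL): slave request preserved
    return Action::RollbackDropRequest;    // do_rename_rollback(..., mdr): request retired with the rollback
  }
  // nothing was journaled to roll back (master failed after gathering extra witnesses)
  return Action::FinishRequest;            // request_finish(mdr)
}

int main() {
  std::cout << (on_slave_rename_abort(128, true) == Action::RollbackKeepRequest) << "\n";
  std::cout << (on_slave_rename_abort(0, false) == Action::FinishRequest) << "\n";
}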

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 22/39] mds: handle linkage mismatch during cache rejoin
  2013-03-26  7:21     ` Yan, Zheng
@ 2013-03-29 22:09       ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-29 22:09 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

Updated version looks good.
Reviewed-by: Greg Farnum <greg@inktank.com>

On Tue, Mar 26, 2013 at 12:21 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> Updated patch below
>
> Thanks
> Yan, Zheng
> ------
> From c1d3576556f5ad2849d3079845dc26ef7612e8d3 Mon Sep 17 00:00:00 2001
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
> Date: Thu, 14 Mar 2013 20:06:27 +0800
> Subject: [PATCH 22/39] mds: handle linkage mismatch during cache rejoin
>
> In an MDS cluster, not all file system namespace operations that involve
> multiple MDSs use two-phase commit. Some operations use dentry link/unlink
> messages to update the replica dentry's linkage after they are committed by
> the master MDS. It's possible that the master MDS crashes after journaling
> an operation, but before sending the dentry link/unlink messages. Later,
> when the MDS recovers and receives cache rejoin messages from the surviving
> MDSs, it will find a linkage mismatch.
>
> The original cache rejoin code does not properly handle the case where
> dentry unlink messages were missed. Unlinked inodes were linked to stray
> dentries, so the cache rejoin ack message needs to push replicas of these
> stray dentries to the surviving MDS.
>
> This patch also adds code that handles cache expiration in the middle of
> cache rejoining.
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc | 342 +++++++++++++++++++++++++++++++++++------------------
>  src/mds/MDCache.h  |   1 +
>  src/mds/mdstypes.h |   3 +
>  3 files changed, 229 insertions(+), 117 deletions(-)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 8edb11c..0d2cac1 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -3540,7 +3540,6 @@ void MDCache::rejoin_send_rejoins()
>      } else {
>        // strong
>        if (p->first == 0 && root) {
> -       p->second->add_weak_inode(root->vino());
>         p->second->add_strong_inode(root->vino(),
>                                     root->get_replica_nonce(),
>                                     root->get_caps_wanted(),
> @@ -3554,7 +3553,6 @@ void MDCache::rejoin_send_rejoins()
>        }
>
>        if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
> -       p->second->add_weak_inode(in->vino());
>         p->second->add_strong_inode(in->vino(),
>                                     in->get_replica_nonce(),
>                                     in->get_caps_wanted(),
> @@ -3571,6 +3569,8 @@ void MDCache::rejoin_send_rejoins()
>      for (hash_map<metareqid_t, MDRequest*>::iterator p = active_requests.begin();
>          p != active_requests.end();
>          ++p) {
> +      if ( p->second->is_slave())
> +       continue;
>        // auth pins
>        for (set<MDSCacheObject*>::iterator q = p->second->remote_auth_pins.begin();
>            q != p->second->remote_auth_pins.end();
> @@ -4230,6 +4230,8 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>      rejoin_potential_updated_scatterlocks.insert(in);
>    }
>
> +  rejoin_unlinked_inodes[from].clear();
> +
>    // surviving peer may send incorrect dirfrag here (maybe they didn't
>    // get the fragment notify, or maybe we rolled back?).  we need to
>    // infer the right frag and get them with the program.  somehow.
> @@ -4336,106 +4338,118 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>
>        dn->add_replica(from, q->second.nonce);
>        dout(10) << " have " << *dn << dendl;
> -
> -      // inode?
> -      if (dnl->is_primary()) {
> -       CInode *in = dnl->get_inode();
> -       assert(in);
> -
> -       if (strong->strong_inodes.count(in->vino())) {
> -         MMDSCacheRejoin::inode_strong &is = strong->strong_inodes[in->vino()];
>
> -         // caps_wanted
> -         if (is.caps_wanted) {
> -           in->mds_caps_wanted[from] = is.caps_wanted;
> -           dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
> -                    << " on " << *in << dendl;
> -         }
> -
> -         // scatterlocks?
> -         //  infer state from replica state:
> -         //   * go to MIX if they might have wrlocks
> -         //   * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
> -         in->filelock.infer_state_from_strong_rejoin(is.filelock, true);  // maybe also go to LOCK
> -         in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
> -         in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
> -
> -         // auth pin?
> -         if (strong->authpinned_inodes.count(in->vino())) {
> -           MMDSCacheRejoin::slave_reqid r = strong->authpinned_inodes[in->vino()];
> -           dout(10) << " inode authpin by " << r << " on " << *in << dendl;
> -
> -           // get/create slave mdrequest
> -           MDRequest *mdr;
> -           if (have_request(r.reqid))
> -             mdr = request_get(r.reqid);
> -           else
> -             mdr = request_start_slave(r.reqid, r.attempt, from);
> -           if (strong->frozen_authpin_inodes.count(in->vino())) {
> -             assert(!in->get_num_auth_pins());
> -             mdr->freeze_auth_pin(in);
> -           } else {
> -             assert(!in->is_frozen_auth_pin());
> -           }
> -           mdr->auth_pin(in);
> -         }
> -         // xlock(s)?
> -         if (strong->xlocked_inodes.count(in->vino())) {
> -           for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->xlocked_inodes[in->vino()].begin();
> -                r != strong->xlocked_inodes[in->vino()].end();
> -                ++r) {
> -             SimpleLock *lock = in->get_lock(r->first);
> -             dout(10) << " inode xlock by " << r->second << " on " << *lock << " on " << *in << dendl;
> -             MDRequest *mdr = request_get(r->second.reqid);  // should have this from auth_pin above.
> -             assert(mdr->is_auth_pinned(in));
> -             if (lock->is_stable())
> -               in->auth_pin(lock);
> -             lock->set_state(LOCK_XLOCK);
> -             if (lock == &in->filelock)
> -               in->loner_cap = -1;
> -             lock->get_xlock(mdr, mdr->get_client());
> -             mdr->xlocks.insert(lock);
> -             mdr->locks.insert(lock);
> -           }
> -         }
> -         // wrlock(s)?
> -         if (strong->wrlocked_inodes.count(in->vino())) {
> -           for (map<int,MMDSCacheRejoin::slave_reqid>::iterator r = strong->wrlocked_inodes[in->vino()].begin();
> -                r != strong->wrlocked_inodes[in->vino()].end();
> -                ++r) {
> -             SimpleLock *lock = in->get_lock(r->first);
> -             dout(10) << " inode wrlock by " << r->second << " on " << *lock << " on " << *in << dendl;
> -             MDRequest *mdr = request_get(r->second.reqid);  // should have this from auth_pin above.
> -             assert(mdr->is_auth_pinned(in));
> -             lock->set_state(LOCK_LOCK);
> -             if (lock == &in->filelock)
> -               in->loner_cap = -1;
> -             lock->get_wrlock(true);
> -             mdr->wrlocks.insert(lock);
> -             mdr->locks.insert(lock);
> -           }
> +      if (dnl->is_primary()) {
> +       if (q->second.is_primary()) {
> +         if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino()) {
> +           // the survivor missed MDentryUnlink+MDentryLink messages ?
> +           assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
> +           CInode *in = get_inode(q->second.ino, q->first.snapid);
> +           assert(in);
> +           assert(in->get_parent_dn());
> +           rejoin_unlinked_inodes[from].insert(in);
> +           dout(7) << " sender has primary dentry but wrong inode" << dendl;
>           }
>         } else {
> -         dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl;
> +         // the survivor missed MDentryLink message ?
> +         assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
> +         dout(7) << " sender doesn't have primary dentry" << dendl;
> +       }
> +      } else {
> +       if (q->second.is_primary()) {
> +         // the survivor missed MDentryUnlink message ?
> +         CInode *in = get_inode(q->second.ino, q->first.snapid);
> +         assert(in);
> +         assert(in->get_parent_dn());
> +         rejoin_unlinked_inodes[from].insert(in);
> +         dout(7) << " sender has primary dentry but we don't" << dendl;
>         }
> -
> -       in->add_replica(from, p->second.nonce);
> -       dout(10) << " have " << *in << dendl;
>        }
>      }
>    }
>
> -  // base inodes?  (root, stray, etc.)
> -  for (set<vinodeno_t>::iterator p = strong->weak_inodes.begin();
> -       p != strong->weak_inodes.end();
> +  for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
> +       p != strong->strong_inodes.end();
>         ++p) {
> -    CInode *in = get_inode(*p);
> -    dout(10) << " have base " << *in << dendl;
> -    in->add_replica(from);
> +    CInode *in = get_inode(p->first);
> +    assert(in);
> +    in->add_replica(from, p->second.nonce);
> +    dout(10) << " have " << *in << dendl;
> +
> +    MMDSCacheRejoin::inode_strong &is = p->second;
> +
> +    // caps_wanted
> +    if (is.caps_wanted) {
> +      in->mds_caps_wanted[from] = is.caps_wanted;
> +      dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
> +              << " on " << *in << dendl;
> +    }
> +
> +    // scatterlocks?
> +    //  infer state from replica state:
> +    //   * go to MIX if they might have wrlocks
> +    //   * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
> +    in->filelock.infer_state_from_strong_rejoin(is.filelock, true);  // maybe also go to LOCK
> +    in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
> +    in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
> +
> +    // auth pin?
> +    if (strong->authpinned_inodes.count(in->vino())) {
> +      MMDSCacheRejoin::slave_reqid r = strong->authpinned_inodes[in->vino()];
> +      dout(10) << " inode authpin by " << r << " on " << *in << dendl;
> +
> +      // get/create slave mdrequest
> +      MDRequest *mdr;
> +      if (have_request(r.reqid))
> +       mdr = request_get(r.reqid);
> +      else
> +       mdr = request_start_slave(r.reqid, r.attempt, from);
> +      if (strong->frozen_authpin_inodes.count(in->vino())) {
> +       assert(!in->get_num_auth_pins());
> +       mdr->freeze_auth_pin(in);
> +      } else {
> +       assert(!in->is_frozen_auth_pin());
> +      }
> +      mdr->auth_pin(in);
> +    }
> +    // xlock(s)?
> +    if (strong->xlocked_inodes.count(in->vino())) {
> +      for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
> +          q != strong->xlocked_inodes[in->vino()].end();
> +          ++q) {
> +       SimpleLock *lock = in->get_lock(q->first);
> +       dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
> +       MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
> +       assert(mdr->is_auth_pinned(in));
> +       if (lock->is_stable())
> +         in->auth_pin(lock);
> +       lock->set_state(LOCK_XLOCK);
> +       if (lock == &in->filelock)
> +         in->loner_cap = -1;
> +       lock->get_xlock(mdr, mdr->get_client());
> +       mdr->xlocks.insert(lock);
> +       mdr->locks.insert(lock);
> +      }
> +    }
> +    // wrlock(s)?
> +    if (strong->wrlocked_inodes.count(in->vino())) {
> +      for (map<int,MMDSCacheRejoin::slave_reqid>::iterator q = strong->wrlocked_inodes[in->vino()].begin();
> +          q != strong->wrlocked_inodes[in->vino()].end();
> +          ++q) {
> +       SimpleLock *lock = in->get_lock(q->first);
> +       dout(10) << " inode wrlock by " << q->second << " on " << *lock << " on " << *in << dendl;
> +       MDRequest *mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
> +       assert(mdr->is_auth_pinned(in));
> +       lock->set_state(LOCK_LOCK);
> +       if (lock == &in->filelock)
> +         in->loner_cap = -1;
> +       lock->get_wrlock(true);
> +       mdr->wrlocks.insert(lock);
> +       mdr->locks.insert(lock);
> +      }
> +    }
>    }
>
> -
> -
>    // done?
>    assert(rejoin_gather.count(from));
>    rejoin_gather.erase(from);
> @@ -4452,6 +4466,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>    dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
>    int from = ack->get_source().num();
>
> +  // for sending cache expire message
> +  list<CInode*> isolated_inodes;
> +
>    // dirs
>    for (map<dirfrag_t, MMDSCacheRejoin::dirfrag_strong>::iterator p = ack->strong_dirfrags.begin();
>         p != ack->strong_dirfrags.end();
> @@ -4459,7 +4476,29 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>      // we may have had incorrect dir fragmentation; refragment based
>      // on what they auth tells us.
>      CDir *dir = get_force_dirfrag(p->first);
> -    assert(dir);
> +    if (!dir) {
> +      CInode *diri = get_inode(p->first.ino);
> +      if (!diri) {
> +       // barebones inode; the full inode loop below will clean up.
> +       diri = new CInode(this, false);
> +       diri->inode.ino = p->first.ino;
> +       diri->inode.mode = S_IFDIR;
> +       if (MDS_INO_MDSDIR(p->first.ino)) {
> +         diri->inode_auth = pair<int,int>(from, CDIR_AUTH_UNKNOWN);
> +         add_inode(diri);
> +         dout(10) << " add inode " << *diri << dendl;
> +       } else {
> +         diri->inode_auth = CDIR_AUTH_UNDEF;
> +         isolated_inodes.push_back(diri);
> +         dout(10) << " unconnected dirfrag " << p->first << dendl;
> +       }
> +      }
> +      // barebones dirfrag; the full dirfrag loop below will clean up.
> +      dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false));
> +      if (dir->authority().first != from)
> +       adjust_subtree_auth(dir, from);
> +      dout(10) << " add dirfrag " << *dir << dendl;
> +    }
>
>      dir->set_replica_nonce(p->second.nonce);
>      dir->state_clear(CDir::STATE_REJOINING);
> @@ -4471,7 +4510,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>          q != dmap.end();
>          ++q) {
>        CDentry *dn = dir->lookup(q->first.name, q->first.snapid);
> -      assert(dn);
> +      if(!dn)
> +       dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid);
> +
>        CDentry::linkage_t *dnl = dn->get_linkage();
>
>        assert(dn->last == q->first.snapid);
> @@ -4480,33 +4521,48 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>         dn->first = q->second.first;
>        }
>
> +      // may have bad linkage if we missed dentry link/unlink messages
> +      if (dnl->is_primary()) {
> +       CInode *in = dnl->get_inode();
> +       if (!q->second.is_primary() ||
> +           vinodeno_t(q->second.ino, q->first.snapid) != in->vino()) {
> +         dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
> +         dir->unlink_inode(dn);
> +       }
> +      } else if (dnl->is_remote()) {
> +       if (!q->second.is_remote() ||
> +           q->second.remote_ino != dnl->get_remote_ino() ||
> +           q->second.remote_d_type != dnl->get_remote_d_type()) {
> +         dout(10) << " had bad linkage for " << *dn <<  dendl;
> +         dir->unlink_inode(dn);
> +       }
> +      } else {
> +       if (!q->second.is_null())
> +         dout(10) << " had bad linkage for " << *dn <<  dendl;
> +      }
> +
>        // hmm, did we have the proper linkage here?
> -      if (dnl->is_null() &&
> -         !q->second.is_null()) {
> -       dout(10) << " had bad (missing) linkage for " << *dn << dendl;
> +      if (dnl->is_null() && !q->second.is_null()) {
>         if (q->second.is_remote()) {
>           dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type);
>         } else {
>           CInode *in = get_inode(q->second.ino, q->first.snapid);
> -         assert(in == 0);  // a rename would have been caught be the resolve stage.
> -         // barebones inode; the full inode loop below will clean up.
> -         in = new CInode(this, false, q->second.first, q->first.snapid);
> -         in->inode.ino = q->second.ino;
> -         add_inode(in);
> +         if (!in) {
> +           // barebones inode; assume it's dir, the full inode loop below will clean up.
> +           in = new CInode(this, false, q->second.first, q->first.snapid);
> +           in->inode.ino = q->second.ino;
> +           in->inode.mode = S_IFDIR;
> +           add_inode(in);
> +           dout(10) << " add inode " << *in << dendl;
> +         } else if (in->get_parent_dn()) {
> +           dout(10) << " had bad linkage for " << *(in->get_parent_dn())
> +                    << ", unlinking " << *in << dendl;
> +           in->get_parent_dir()->unlink_inode(in->get_parent_dn());
> +         }
>           dn->dir->link_primary_inode(dn, in);
>         }
>        }
> -      else if (!dnl->is_null() &&
> -              q->second.is_null()) {
> -       dout(0) << " had bad linkage for " << *dn << dendl;
> -       /*
> -        * this should happen:
> -        *  if we're a survivor, any unlink should commit or rollback during
> -        * the resolve stage.
> -        *  if we failed, we shouldn't have non-auth leaf dentries at all
> -        */
> -       assert(0);  // uh oh.
> -      }
> +
>        dn->set_replica_nonce(q->second.nonce);
>        dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters);
>        dn->state_clear(CDentry::STATE_REJOINING);
> @@ -4565,6 +4621,21 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
>      dout(10) << " got inode locks " << *in << dendl;
>    }
>
> +  // trim unconnected subtree
> +  if (!isolated_inodes.empty()) {
> +    map<int, MCacheExpire*> expiremap;
> +    for (list<CInode*>::iterator p = isolated_inodes.begin();
> +        p != isolated_inodes.end();
> +        ++p) {
> +      list<CDir*> ls;
> +      (*p)->get_dirfrags(ls);
> +      trim_dirfrag(*ls.begin(), 0, expiremap);
> +      assert((*p)->get_num_ref() == 0);
> +      delete *p;
> +    }
> +    send_expire_messages(expiremap);
> +  }
> +
>    // done?
>    assert(rejoin_ack_gather.count(from));
>    rejoin_ack_gather.erase(from);
> @@ -5165,6 +5236,37 @@ void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snap
>  void MDCache::rejoin_send_acks()
>  {
>    dout(7) << "rejoin_send_acks" << dendl;
> +
> +  // replicate stray
> +  for (map<int, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
> +       p != rejoin_unlinked_inodes.end();
> +       ++p) {
> +    for (set<CInode*>::iterator q = p->second.begin();
> +        q != p->second.end();
> +        ++q) {
> +      CInode *in = *q;
> +      dout(7) << " unlinked inode " << *in << dendl;
> +      // inode expired
> +      if (!in->is_replica(p->first))
> +       continue;
> +      while (1) {
> +       CDentry *dn = in->get_parent_dn();
> +       if (dn->is_replica(p->first))
> +         break;
> +       dn->add_replica(p->first);
> +       CDir *dir = dn->get_dir();
> +       if (dir->is_replica(p->first))
> +         break;
> +       dir->add_replica(p->first);
> +       in = dir->get_inode();
> +       if (in->is_replica(p->first))
> +         break;
> +       if (in->is_base())
> +         break;
> +      }
> +    }
> +  }
> +  rejoin_unlinked_inodes.clear();
>
>    // send acks to everyone in the recovery set
>    map<int,MMDSCacheRejoin*> ack;
> @@ -5204,23 +5306,29 @@ void MDCache::rejoin_send_acks()
>         CDentry *dn = q->second;
>         CDentry::linkage_t *dnl = dn->get_linkage();
>
> +       // inode
> +       CInode *in = NULL;
> +       if (dnl->is_primary())
> +         in = dnl->get_inode();
> +
>         // dentry
>         for (map<int,int>::iterator r = dn->replicas_begin();
>              r != dn->replicas_end();
> -            ++r)
> +            ++r) {
>           ack[r->first]->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last,
>                                            dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
>                                            dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
>                                            dnl->is_remote() ? dnl->get_remote_d_type():0,
>                                            ++r->second,
>                                            dn->lock.get_replica_state());
> +         // peer missed MDentryLink message ?
> +         if (in && !in->is_replica(r->first))
> +           in->add_replica(r->first);
> +       }
>
> -       if (!dnl->is_primary())
> +       if (!in)
>           continue;
>
> -       // inode
> -       CInode *in = dnl->get_inode();
> -
>         for (map<int,int>::iterator r = in->replicas_begin();
>              r != in->replicas_end();
>              ++r) {
> diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
> index 2a65d0a..73780e2 100644
> --- a/src/mds/MDCache.h
> +++ b/src/mds/MDCache.h
> @@ -416,6 +416,7 @@ protected:
>    set<CInode*> rejoin_undef_inodes;
>    set<CInode*> rejoin_potential_updated_scatterlocks;
>    set<CDir*>   rejoin_undef_dirfrags;
> +  map<int, set<CInode*> > rejoin_unlinked_inodes;
>
>    vector<CInode*> rejoin_recover_q, rejoin_check_q;
>    list<Context*> rejoin_waiters;
> diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
> index cf5c1a7..aa9d165 100644
> --- a/src/mds/mdstypes.h
> +++ b/src/mds/mdstypes.h
> @@ -235,6 +235,9 @@ WRITE_CLASS_ENCODER(vinodeno_t)
>  inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) {
>    return l.ino == r.ino && l.snapid == r.snapid;
>  }
> +inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) {
> +  return !(l == r);
> +}
>  inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) {
>    return
>      l.ino < r.ino ||
> --
> 1.7.11.7
>
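The linkage-mismatch cases in handle_cache_rejoin_strong above can be summarized with a small stand-alone sketch. Illustrative only: LinkKind/Linkage are simplified stand-ins for CDentry::linkage_t and the strong-rejoin dentry record, and the return value names which dentry link/unlink message the surviving peer appears to have missed; in the real patch the mismatched inode is additionally recorded in rejoin_unlinked_inodes so the rejoin ack can replicate its stray dentry.

#include <cstdint>
#include <iostream>

enum class LinkKind { Null, Primary, Remote };

struct Linkage {
  LinkKind kind;
  std::uint64_t ino;   // meaningful when kind == Primary
};

enum class Mismatch { None, MissedLink, MissedUnlink, MissedBoth };

// 'local' is the recovering MDS's authoritative linkage for a dentry,
// 'claimed' is what the surviving peer reported in its strong rejoin entry.
Mismatch classify(const Linkage& local, const Linkage& claimed) {
  if (local.kind == LinkKind::Primary) {
    if (claimed.kind == LinkKind::Primary && claimed.ino != local.ino)
      return Mismatch::MissedBoth;    // peer has a primary dentry but the wrong inode
    if (claimed.kind != LinkKind::Primary)
      return Mismatch::MissedLink;    // peer doesn't have the primary dentry at all
  } else if (claimed.kind == LinkKind::Primary) {
    return Mismatch::MissedUnlink;    // peer still thinks the dentry is primary
  }
  return Mismatch::None;              // linkage matches; nothing was missed
}

int main() {
  Linkage local{LinkKind::Primary, 100};
  Linkage stale{LinkKind::Primary, 200};
  std::cout << (classify(local, stale) == Mismatch::MissedBoth) << "\n";
}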

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 04/39] mds: make sure table request id unique
  2013-03-25 11:30                 ` Yan, Zheng
@ 2013-03-29 22:12                   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-03-29 22:12 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Sage Weil, ceph-devel

This looks good to me and Sage liked the shape of it.
Reviewed-by: Greg Farnum <greg@inktank.com>

We do still need to update the mds protocol version, and I'd like to
switch over the messages that are already changed to the new encoding
system at the same time. I'm happy to do all that but will wait to
hear back about that patch I think we can drop before doing so.

Thanks very much for all the work! :)
-Greg

On Mon, Mar 25, 2013 at 4:30 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> On 03/22/2013 06:03 AM, Gregory Farnum wrote:
>> Right. I'd like to somehow mark those reqid's so that we can tell when
>> they come from a different incarnation of the MDS TableClient daemon.
>> One way is via some piece of random data that will probably
>> distinguish them, although if we have something which we can know is
>> different that would be preferable. I think we can work something out
>> of the startup session data each MDS does with the monitors, but I'm
>> not sure I can check any time soon; I have a number of other things to
>> get to now that I've gotten through (the first round on) this series.
>>
>
> How about the attached patch?
>
> Thanks
>
> ----
> commit d460b766e16ec2cacac239a74af0e226108ab95a
> Author: Yan, Zheng <zheng.z.yan@intel.com>
> Date:   Sat Mar 16 08:02:18 2013 +0800
>
>     mds: make sure table request id unique
>
>     When a MDS becomes active, the table server re-sends 'agree' messages
>     for old prepared request. If the recoverd MDS starts a new table request
>     at the same time, The new request's ID can happen to be the same as old
>     prepared request's ID, because current table client code assigns request
>     ID from zero after MDS restarts.
>
>     This patch make table server send 'ready' messages when table clients
>     become active or itself becomes active. The 'ready' message updates
>     table client's last_reqid to avoid request ID collision. The message
>     also replaces the roles of finish_recovery() and handle_mds_recovery()
>     callbacks for table client.
>
>     Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>
> diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
> index bb1c833..834a7aa 100644
> --- a/src/mds/MDS.cc
> +++ b/src/mds/MDS.cc
> @@ -1508,14 +1508,13 @@ void MDS::recovery_done()
>
>    // kick anchortable (resent AGREEs)
>    if (mdsmap->get_tableserver() == whoami) {
> -    anchorserver->finish_recovery();
> -    snapserver->finish_recovery();
> +    set<int> active;
> +    mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE);
> +    mdsmap->get_mds_set(active, MDSMap::STATE_STOPPING);
> +    anchorserver->finish_recovery(active);
> +    snapserver->finish_recovery(active);
>    }
> -
> -  // kick anchorclient (resent COMMITs)
> -  anchorclient->finish_recovery();
> -  snapclient->finish_recovery();
> -
> +
>    mdcache->start_recovered_truncates();
>    mdcache->do_file_recover();
>
> @@ -1537,8 +1536,6 @@ void MDS::handle_mds_recovery(int who)
>      anchorserver->handle_mds_recovery(who);
>      snapserver->handle_mds_recovery(who);
>    }
> -  anchorclient->handle_mds_recovery(who);
> -  snapclient->handle_mds_recovery(who);
>
>    queue_waiters(waiting_for_active_peer[who]);
>    waiting_for_active_peer.erase(who);
> diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc
> index ea021f5..29e172b 100644
> --- a/src/mds/MDSTableClient.cc
> +++ b/src/mds/MDSTableClient.cc
> @@ -101,6 +101,16 @@ void MDSTableClient::handle_request(class MMDSTableRequest *m)
>      }
>      break;
>
> +  case TABLESERVER_OP_SERVER_READY:
> +    if (last_reqid == 0) {
> +      assert(reqid > 0);
> +      last_reqid = reqid;
> +    }
> +
> +    resend_queries();
> +    resend_prepares();
> +    resend_commits();
> +    break;
>    default:
>      assert(0);
>    }
> @@ -126,19 +136,23 @@ void MDSTableClient::_logged_ack(version_t tid)
>  void MDSTableClient::_prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl,
>                               Context *onfinish)
>  {
> -  uint64_t reqid = ++last_reqid;
> -  dout(10) << "_prepare " << reqid << dendl;
> -
> -  // send message
> -  MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, reqid);
> -  req->bl = mutation;
> -
> -  pending_prepare[reqid].mutation = mutation;
> -  pending_prepare[reqid].ptid = ptid;
> -  pending_prepare[reqid].pbl = pbl;
> -  pending_prepare[reqid].onfinish = onfinish;
> -
> -  send_to_tableserver(req);
> +  if (last_reqid > 0) {
> +    uint64_t reqid = ++last_reqid;
> +    dout(10) << "_prepare " << reqid << dendl;
> +    // send message
> +    MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, reqid);
> +    req->bl = mutation;
> +
> +    pending_prepare[reqid].mutation = mutation;
> +    pending_prepare[reqid].ptid = ptid;
> +    pending_prepare[reqid].pbl = pbl;
> +    pending_prepare[reqid].onfinish = onfinish;
> +
> +    send_to_tableserver(req);
> +  } else {
> +    dout(10) << "table server is not ready yet, waiting" << dendl;
> +    waiting_for_server.push_back(_pending_prepare(onfinish, ptid, pbl, mutation));
> +  }
>  }
>
>  void MDSTableClient::send_to_tableserver(MMDSTableRequest *req)
> @@ -176,6 +190,7 @@ void MDSTableClient::got_journaled_agree(version_t tid, LogSegment *ls)
>    ls->pending_commit_tids[table].insert(tid);
>    pending_commit[tid] = ls;
>  }
> +
>  void MDSTableClient::got_journaled_ack(version_t tid)
>  {
>    dout(10) << "got_journaled_ack " << tid << dendl;
> @@ -185,12 +200,6 @@ void MDSTableClient::got_journaled_ack(version_t tid)
>    }
>  }
>
> -void MDSTableClient::finish_recovery()
> -{
> -  dout(7) << "finish_recovery" << dendl;
> -  resend_commits();
> -}
> -
>  void MDSTableClient::resend_commits()
>  {
>    for (map<version_t,LogSegment*>::iterator p = pending_commit.begin();
> @@ -202,24 +211,18 @@ void MDSTableClient::resend_commits()
>    }
>  }
>
> -void MDSTableClient::handle_mds_recovery(int who)
> +void MDSTableClient::resend_prepares()
>  {
> -  dout(7) << "handle_mds_recovery mds." << who << dendl;
> -
> -  if (who != mds->mdsmap->get_tableserver())
> -    return; // do nothing.
> -
> -  resend_queries();
> -
> -  // prepares.
> +  while (!waiting_for_server.empty()) {
> +    pending_prepare[++last_reqid] = waiting_for_server.front();
> +    waiting_for_server.pop_front();
> +  }
>    for (map<uint64_t, _pending_prepare>::iterator p = pending_prepare.begin();
>         p != pending_prepare.end();
>         ++p) {
> -    dout(10) << "resending " << p->first << dendl;
> +    dout(10) << "resending prepare on " << p->first << dendl;
>      MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, p->first);
>      req->bl = p->second.mutation;
>      mds->send_message_mds(req, mds->mdsmap->get_tableserver());
> -  }
> -
> -  resend_commits();
> +  }
>  }
> diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
> index e15837f..7638260 100644
> --- a/src/mds/MDSTableClient.h
> +++ b/src/mds/MDSTableClient.h
> @@ -38,9 +38,12 @@ protected:
>      bufferlist mutation;
>
>      _pending_prepare() : onfinish(0), ptid(0), pbl(0) {}
> +    _pending_prepare(Context *c, version_t *pt, bufferlist *pb, bufferlist& m) :
> +      onfinish(c), ptid(pt), pbl(pb), mutation(m) {}
>    };
>
>    map<uint64_t, _pending_prepare> pending_prepare;
> +  list<_pending_prepare> waiting_for_server;
>
>    // pending commits
>    map<version_t, LogSegment*> pending_commit;
> @@ -68,9 +71,8 @@ public:
>    void _prepare(bufferlist& mutation, version_t *ptid, bufferlist *pbl, Context *onfinish);
>    void commit(version_t tid, LogSegment *ls);
>
> -  // for recovery (by other nodes)
> -  void handle_mds_recovery(int mds); // called when someone else recovers
>    void resend_commits();
> +  void resend_prepares();
>
>    // for recovery (by me)
>    void got_journaled_agree(version_t tid, LogSegment *ls);
> @@ -82,7 +84,6 @@ public:
>    void wait_for_ack(version_t tid, Context *c) {
>      ack_waiters[tid].push_back(c);
>    }
> -  void finish_recovery();                // called when i recover and go active
>
>    void send_to_tableserver(MMDSTableRequest *req);
>
> diff --git a/src/mds/MDSTableServer.cc b/src/mds/MDSTableServer.cc
> index 4f86ff1..e56e2b4 100644
> --- a/src/mds/MDSTableServer.cc
> +++ b/src/mds/MDSTableServer.cc
> @@ -144,24 +144,30 @@ void MDSTableServer::do_server_update(bufferlist& bl)
>
>  // recovery
>
> -void MDSTableServer::finish_recovery()
> +void MDSTableServer::finish_recovery(set<int>& active)
>  {
>    dout(7) << "finish_recovery" << dendl;
> -  handle_mds_recovery(-1);  // resend agrees for everyone.
> +  for (set<int>::iterator p = active.begin(); p != active.end(); ++p)
> +    handle_mds_recovery(*p);  // resend agrees for everyone.
>  }
>
>  void MDSTableServer::handle_mds_recovery(int who)
>  {
> -  if (who >= 0)
> -    dout(7) << "handle_mds_recovery mds." << who << dendl;
> -
> +  dout(7) << "handle_mds_recovery mds." << who << dendl;
> +
> +  uint64_t next_reqid = 1;
>    // resend agrees for recovered mds
>    for (map<version_t,mds_table_pending_t>::iterator p = pending_for_mds.begin();
>         p != pending_for_mds.end();
>         ++p) {
> -    if (who >= 0 && p->second.mds != who)
> +    if (p->second.mds != who)
>        continue;
> +    if (p->second.reqid >= next_reqid)
> +      next_reqid = p->second.reqid + 1;
>      MMDSTableRequest *reply = new MMDSTableRequest(table, TABLESERVER_OP_AGREE, p->second.reqid, p->second.tid);
> -    mds->send_message_mds(reply, p->second.mds);
> +    mds->send_message_mds(reply, who);
>    }
> +
> +  MMDSTableRequest *reply = new MMDSTableRequest(table, TABLESERVER_OP_SERVER_READY, next_reqid);
> +  mds->send_message_mds(reply, who);
>  }
> diff --git a/src/mds/MDSTableServer.h b/src/mds/MDSTableServer.h
> index 26cd594..55827e7 100644
> --- a/src/mds/MDSTableServer.h
> +++ b/src/mds/MDSTableServer.h
> @@ -90,7 +90,7 @@ private:
>    }
>
>    // recovery
> -  void finish_recovery();
> +  void finish_recovery(set<int>& active);
>    void handle_mds_recovery(int who);
>  };
>
> diff --git a/src/mds/mds_table_types.h b/src/mds/mds_table_types.h
> index b094c75..c08519a 100644
> --- a/src/mds/mds_table_types.h
> +++ b/src/mds/mds_table_types.h
> @@ -39,6 +39,7 @@ enum {
>    TABLESERVER_OP_ACK          = -6,
>    TABLESERVER_OP_ROLLBACK     =  7,
>    TABLESERVER_OP_SERVER_UPDATE = 8,
> +  TABLESERVER_OP_SERVER_READY = -9,
>  };
>
>  inline const char *get_mdstableserver_opname(int op) {
> @@ -51,6 +52,7 @@ inline const char *get_mdstableserver_opname(int op) {
>    case TABLESERVER_OP_ACK: return "ack";
>    case TABLESERVER_OP_ROLLBACK: return "rollback";
>    case TABLESERVER_OP_SERVER_UPDATE: return "server_update";
> +  case TABLESERVER_OP_SERVER_READY: return "server_ready";
>    default: assert(0); return 0;
>    }
>  };
>
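A compact sketch of the reqid handshake this patch introduces — simplified stand-ins rather than the real MDSTableServer/MDSTableClient (mds_table_pending_t becomes PendingUpdate, and the "not yet seeded" sentinel is 0 here where a later revision of the patch uses ~0ULL). The server derives the first safe request id from the prepared requests it still holds for the recovering MDS, and the client seeds its counter from the 'ready' message, so freshly assigned reqids cannot collide with old prepared ones.

#include <cstdint>
#include <iostream>
#include <map>

struct PendingUpdate { int mds; std::uint64_t reqid; };

// Server side: the id carried by TABLESERVER_OP_SERVER_READY.
std::uint64_t server_ready_reqid(const std::map<std::uint64_t, PendingUpdate>& pending_for_mds,
                                 int recovering_mds) {
  std::uint64_t next_reqid = 1;
  for (const auto& kv : pending_for_mds) {
    const PendingUpdate& p = kv.second;
    if (p.mds != recovering_mds)
      continue;
    if (p.reqid >= next_reqid)
      next_reqid = p.reqid + 1;   // stay above every still-prepared reqid
  }
  return next_reqid;
}

// Client side: seed the counter once from the 'ready' message.
struct TableClient {
  std::uint64_t last_reqid = 0;   // 0 == "not seeded yet" in this sketch
  void handle_server_ready(std::uint64_t reqid) {
    if (last_reqid == 0)
      last_reqid = reqid;
  }
  std::uint64_t next_prepare_reqid() { return ++last_reqid; }
};

int main() {
  std::map<std::uint64_t, PendingUpdate> pending = {{7, {0, 3}}, {9, {0, 5}}};
  TableClient c;
  c.handle_server_ready(server_ready_reqid(pending, 0));   // seeds last_reqid = 6
  std::cout << c.next_prepare_reqid() << "\n";              // 7, no collision with 3 or 5
}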

^ permalink raw reply	[flat|nested] 117+ messages in thread

* Re: [PATCH 06/39] mds: make table client/server tolerate duplicated message
  2013-03-29 22:00   ` Gregory Farnum
@ 2013-03-31 13:21     ` Yan, Zheng
  0 siblings, 0 replies; 117+ messages in thread
From: Yan, Zheng @ 2013-03-31 13:21 UTC (permalink / raw)
  To: Gregory Farnum; +Cc: ceph-devel, Sage Weil

On 03/30/2013 06:00 AM, Gregory Farnum wrote:
> I believe this patch has been outdated thanks to the tid exchange
> you're doing now, right?
> -Greg

The tid exchange does not by itself avoid duplicated prepare/commit messages,
but it makes avoiding them easier. How about the patch below?

Thanks
Yan, Zheng

----
From e3d7b3e1d757aee847384180e2d6ee59a900ca05 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Sun, 31 Mar 2013 17:54:50 +0800
Subject: [PATCH] mds: avoid sending duplicated table prepare/commit

This patch makes the table client defer sending table prepare/commit messages
until it receives the table server's 'ready' message. This avoids duplicated
table prepare/commit messages.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/AnchorClient.cc   |  9 +++++--
 src/mds/MDS.cc            | 14 +++++++++--
 src/mds/MDS.h             |  4 +++-
 src/mds/MDSTableClient.cc | 60 +++++++++++++++++++++++++----------------------
 src/mds/MDSTableClient.h  |  7 ++++--
 5 files changed, 59 insertions(+), 35 deletions(-)

diff --git a/src/mds/AnchorClient.cc b/src/mds/AnchorClient.cc
index 455e97f..bcc8710 100644
--- a/src/mds/AnchorClient.cc
+++ b/src/mds/AnchorClient.cc
@@ -41,7 +41,9 @@ void AnchorClient::handle_query_result(class MMDSTableRequest *m)
   ::decode(ino, p);
   ::decode(trace, p);
 
-  assert(pending_lookup.count(ino));
+  if (!pending_lookup.count(ino))
+    return;
+
   list<_pending_lookup> ls;
   ls.swap(pending_lookup[ino]);
   pending_lookup.erase(ino);
@@ -80,9 +82,12 @@ void AnchorClient::lookup(inodeno_t ino, vector<Anchor>& trace, Context *onfinis
 
 void AnchorClient::_lookup(inodeno_t ino)
 {
+  int ts = mds->mdsmap->get_tableserver();
+  if (mds->mdsmap->get_state(ts) < MDSMap::STATE_REJOIN)
+    return;
   MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_QUERY, 0, 0);
   ::encode(ino, req->bl);
-  mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+  mds->send_message_mds(req, ts);
 }
 
 
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index 32bb064..2d48815 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -1050,7 +1050,7 @@ void MDS::handle_mds_map(MMDSMap *m)
     for (set<int>::iterator p = failed.begin(); p != failed.end(); ++p)
       if (oldfailed.count(*p) == 0) {
 	messenger->mark_down(oldmap->get_inst(*p).addr);
-	mdcache->handle_mds_failure(*p);
+	handle_mds_failure(*p);
       }
     
     // or down then up?
@@ -1061,7 +1061,7 @@ void MDS::handle_mds_map(MMDSMap *m)
       if (oldmap->have_inst(*p) &&
 	  oldmap->get_inst(*p) != mdsmap->get_inst(*p)) {
 	messenger->mark_down(oldmap->get_inst(*p).addr);
-	mdcache->handle_mds_failure(*p);
+	handle_mds_failure(*p);
       }
   }
   if (is_clientreplay() || is_active() || is_stopping()) {
@@ -1548,6 +1548,16 @@ void MDS::handle_mds_recovery(int who)
   waiting_for_active_peer.erase(who);
 }
 
+void MDS::handle_mds_failure(int who)
+{
+  dout(5) << "handle_mds_failure mds." << who << dendl;
+
+  mdcache->handle_mds_failure(who);
+
+  anchorclient->handle_mds_failure(who);
+  snapclient->handle_mds_failure(who);
+}
+
 void MDS::stopping_start()
 {
   dout(2) << "stopping_start" << dendl;
diff --git a/src/mds/MDS.h b/src/mds/MDS.h
index 42e8516..6658cf0 100644
--- a/src/mds/MDS.h
+++ b/src/mds/MDS.h
@@ -378,13 +378,15 @@ class MDS : public Dispatcher {
   void rejoin_joint_start();
   void rejoin_done();
   void recovery_done();
-  void handle_mds_recovery(int who);
   void clientreplay_start();
   void clientreplay_done();
   void active_start();
   void stopping_start();
   void stopping_done();
 
+  void handle_mds_recovery(int who);
+  void handle_mds_failure(int who);
+
   void suicide();
   void respawn();
 
diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc
index 12331f9..2ce3286 100644
--- a/src/mds/MDSTableClient.cc
+++ b/src/mds/MDSTableClient.cc
@@ -65,18 +65,15 @@ void MDSTableClient::handle_request(class MMDSTableRequest *m)
       }
     } 
     else if (pending_commit.count(tid)) {
-      dout(10) << "stray agree on " << reqid
-	       << " tid " << tid
-	       << ", already committing, resending COMMIT"
-	       << dendl;      
-      MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_COMMIT, 0, tid);
-      mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+      dout(10) << "stray agree on " << reqid << " tid " << tid
+	       << ", already committing, will resend COMMIT" << dendl;
+      assert(!server_ready);
+      // will re-send commit when receiving the server ready message
     }
     else {
-      dout(10) << "stray agree on " << reqid
-	       << " tid " << tid
-	       << ", sending ROLLBACK"
-	       << dendl;      
+      dout(10) << "stray agree on " << reqid << " tid " << tid
+	       << ", sending ROLLBACK" << dendl;
+      assert(!server_ready);
       MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_ROLLBACK, 0, tid);
       mds->send_message_mds(req, mds->mdsmap->get_tableserver());
     }
@@ -102,6 +99,9 @@ void MDSTableClient::handle_request(class MMDSTableRequest *m)
     break;
 
   case TABLESERVER_OP_SERVER_READY:
+    assert(!server_ready);
+    server_ready = true;
+
     if (last_reqid == ~0ULL)
       last_reqid = reqid;
 
@@ -144,26 +144,18 @@ void MDSTableClient::_prepare(bufferlist& mutation, version_t *ptid, bufferlist
   uint64_t reqid = ++last_reqid;
   dout(10) << "_prepare " << reqid << dendl;
 
-  // send message
-  MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, reqid);
-  req->bl = mutation;
-
   pending_prepare[reqid].mutation = mutation;
   pending_prepare[reqid].ptid = ptid;
   pending_prepare[reqid].pbl = pbl;
   pending_prepare[reqid].onfinish = onfinish;
 
-  send_to_tableserver(req);
-}
-
-void MDSTableClient::send_to_tableserver(MMDSTableRequest *req)
-{
-  int ts = mds->mdsmap->get_tableserver();
-  if (mds->mdsmap->get_state(ts) >= MDSMap::STATE_CLIENTREPLAY)
-    mds->send_message_mds(req, ts);
-  else {
-    dout(10) << " deferring request to not-yet-active tableserver mds." << ts << dendl;
-  }
+  if (server_ready) {
+    // send message
+    MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, reqid);
+    req->bl = mutation;
+    mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+  } else
+    dout(10) << "tableserver is not ready yet, deferring request" << dendl;
 }
 
 void MDSTableClient::commit(version_t tid, LogSegment *ls)
@@ -176,9 +168,12 @@ void MDSTableClient::commit(version_t tid, LogSegment *ls)
 
   assert(g_conf->mds_kill_mdstable_at != 4);
 
-  // send message
-  MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_COMMIT, 0, tid);
-  send_to_tableserver(req);
+  if (server_ready) {
+    // send message
+    MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_COMMIT, 0, tid);
+    mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+  } else
+    dout(10) << "tableserver is not ready yet, deferring request" << dendl;
 }
 
 
@@ -228,3 +223,12 @@ void MDSTableClient::resend_prepares()
     mds->send_message_mds(req, mds->mdsmap->get_tableserver());
   }
 }
+
+void MDSTableClient::handle_mds_failure(int who)
+{
+  if (who != mds->mdsmap->get_tableserver())
+    return; // do nothing.
+
+  dout(7) << "tableserver mds." << who << " fails" << dendl;
+  server_ready = false;
+}
diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
index 934f5fe..f8a84eb 100644
--- a/src/mds/MDSTableClient.h
+++ b/src/mds/MDSTableClient.h
@@ -30,6 +30,8 @@ protected:
 
   uint64_t last_reqid;
 
+  bool server_ready;
+
   // prepares
   struct _pending_prepare {
     Context *onfinish;
@@ -63,7 +65,8 @@ protected:
   void _logged_ack(version_t tid);
 
 public:
-  MDSTableClient(MDS *m, int tab) : mds(m), table(tab), last_reqid(~0ULL) {}
+  MDSTableClient(MDS *m, int tab) :
+    mds(m), table(tab), last_reqid(~0ULL), server_ready(false) {}
   virtual ~MDSTableClient() {}
 
   void handle_request(MMDSTableRequest *m);
@@ -85,7 +88,7 @@ public:
     ack_waiters[tid].push_back(c);
   }
 
-  void send_to_tableserver(MMDSTableRequest *req);
+  void handle_mds_failure(int mds);
 
   // child must implement
   virtual void resend_queries() = 0;
-- 
1.7.11.7
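The deferral in the patch above, reduced to its core — a hedged sketch, not the real MDSTableClient: in the actual code the existing pending_prepare and pending_commit maps double as the resend queue, whereas this stand-in uses a generic list of deferred sends. The point is only that nothing is sent while server_ready is false, everything queued is flushed when the 'ready' message arrives, and a table-server failure clears the flag again.

#include <functional>
#include <iostream>
#include <vector>

struct TableClient {
  bool server_ready = false;
  std::vector<std::function<void()>> deferred;   // queued prepare/commit sends

  void send_or_defer(std::function<void()> send) {
    if (server_ready)
      send();                                    // server is up: send immediately
    else
      deferred.push_back(std::move(send));       // otherwise wait for 'ready'
  }
  void handle_server_ready() {
    server_ready = true;
    for (auto& s : deferred) s();                // flush everything queued so far
    deferred.clear();
  }
  void handle_server_failure() { server_ready = false; }  // stop sending to a dead server
};

int main() {
  TableClient c;
  c.send_or_defer([] { std::cout << "PREPARE reqid=1\n"; });  // deferred: server not ready
  c.handle_server_ready();                                    // prints the deferred PREPARE
  c.send_or_defer([] { std::cout << "COMMIT tid=5\n"; });     // sent immediately
}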


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* Re: [PATCH 00/39] fixes for MDS cluster recovery
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (38 preceding siblings ...)
  2013-03-17 14:51 ` [PATCH 39/39] mds: clear scatter dirty if replica inode has no auth subtree Yan, Zheng
@ 2013-04-01  8:46 ` Yan, Zheng
  2013-04-01 17:00   ` Gregory Farnum
  2013-04-01  8:51 ` [PATCH] mds: avoid sending duplicated table prepare/commit Yan, Zheng
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-04-01  8:46 UTC (permalink / raw)
  To: ceph-devel, sage, greg

On 03/17/2013 10:51 PM, Yan, Zheng wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
> 
> This serie fixes issues I encountered when running random MDS restart tests.
> With these patches, my 3 MDS setup that runs fsstress + thrash_exports can
> survive restarting one or two MDS dozens of times.
> 
> But there still are lots of unsolved problems, Sometimes rstat corruption,
> request hangs ...
> 
> This patch series are also in:
>   git://github.com/ukernel/ceph.git wip-mds
> 

I rebased these patches. Replaced patch 5 and patch 6 with two new patches.

[PATCH] mds: avoid sending duplicated table prepare/commit
[PATCH] mds: don't roll back prepared table updates

Regards
Yan, Zheng

^ permalink raw reply	[flat|nested] 117+ messages in thread

* [PATCH] mds: avoid sending duplicated table prepare/commit
  2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
                   ` (39 preceding siblings ...)
  2013-04-01  8:46 ` [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
@ 2013-04-01  8:51 ` Yan, Zheng
  2013-04-01  8:51   ` [PATCH] mds: don't roll back prepared table updates Yan, Zheng
  40 siblings, 1 reply; 117+ messages in thread
From: Yan, Zheng @ 2013-04-01  8:51 UTC (permalink / raw)
  To: ceph-devel, sage, greg; +Cc: Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

This patch makes the table client defer sending table prepare/commit messages
until it receives the table server's 'ready' message. This avoids duplicated
table prepare/commit messages.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/AnchorClient.cc   |  9 +++++--
 src/mds/MDS.cc            | 14 +++++++++--
 src/mds/MDS.h             |  4 +++-
 src/mds/MDSTableClient.cc | 60 +++++++++++++++++++++++++----------------------
 src/mds/MDSTableClient.h  |  7 ++++--
 5 files changed, 59 insertions(+), 35 deletions(-)

diff --git a/src/mds/AnchorClient.cc b/src/mds/AnchorClient.cc
index 455e97f..bcc8710 100644
--- a/src/mds/AnchorClient.cc
+++ b/src/mds/AnchorClient.cc
@@ -41,7 +41,9 @@ void AnchorClient::handle_query_result(class MMDSTableRequest *m)
   ::decode(ino, p);
   ::decode(trace, p);
 
-  assert(pending_lookup.count(ino));
+  if (!pending_lookup.count(ino))
+    return;
+
   list<_pending_lookup> ls;
   ls.swap(pending_lookup[ino]);
   pending_lookup.erase(ino);
@@ -80,9 +82,12 @@ void AnchorClient::lookup(inodeno_t ino, vector<Anchor>& trace, Context *onfinis
 
 void AnchorClient::_lookup(inodeno_t ino)
 {
+  int ts = mds->mdsmap->get_tableserver();
+  if (mds->mdsmap->get_state(ts) < MDSMap::STATE_REJOIN)
+    return;
   MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_QUERY, 0, 0);
   ::encode(ino, req->bl);
-  mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+  mds->send_message_mds(req, ts);
 }
 
 
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index 32bb064..2d48815 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -1050,7 +1050,7 @@ void MDS::handle_mds_map(MMDSMap *m)
     for (set<int>::iterator p = failed.begin(); p != failed.end(); ++p)
       if (oldfailed.count(*p) == 0) {
 	messenger->mark_down(oldmap->get_inst(*p).addr);
-	mdcache->handle_mds_failure(*p);
+	handle_mds_failure(*p);
       }
     
     // or down then up?
@@ -1061,7 +1061,7 @@ void MDS::handle_mds_map(MMDSMap *m)
       if (oldmap->have_inst(*p) &&
 	  oldmap->get_inst(*p) != mdsmap->get_inst(*p)) {
 	messenger->mark_down(oldmap->get_inst(*p).addr);
-	mdcache->handle_mds_failure(*p);
+	handle_mds_failure(*p);
       }
   }
   if (is_clientreplay() || is_active() || is_stopping()) {
@@ -1548,6 +1548,16 @@ void MDS::handle_mds_recovery(int who)
   waiting_for_active_peer.erase(who);
 }
 
+void MDS::handle_mds_failure(int who)
+{
+  dout(5) << "handle_mds_failure mds." << who << dendl;
+
+  mdcache->handle_mds_failure(who);
+
+  anchorclient->handle_mds_failure(who);
+  snapclient->handle_mds_failure(who);
+}
+
 void MDS::stopping_start()
 {
   dout(2) << "stopping_start" << dendl;
diff --git a/src/mds/MDS.h b/src/mds/MDS.h
index 42e8516..6658cf0 100644
--- a/src/mds/MDS.h
+++ b/src/mds/MDS.h
@@ -378,13 +378,15 @@ class MDS : public Dispatcher {
   void rejoin_joint_start();
   void rejoin_done();
   void recovery_done();
-  void handle_mds_recovery(int who);
   void clientreplay_start();
   void clientreplay_done();
   void active_start();
   void stopping_start();
   void stopping_done();
 
+  void handle_mds_recovery(int who);
+  void handle_mds_failure(int who);
+
   void suicide();
   void respawn();
 
diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc
index 12331f9..2ce3286 100644
--- a/src/mds/MDSTableClient.cc
+++ b/src/mds/MDSTableClient.cc
@@ -65,18 +65,15 @@ void MDSTableClient::handle_request(class MMDSTableRequest *m)
       }
     } 
     else if (pending_commit.count(tid)) {
-      dout(10) << "stray agree on " << reqid
-	       << " tid " << tid
-	       << ", already committing, resending COMMIT"
-	       << dendl;      
-      MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_COMMIT, 0, tid);
-      mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+      dout(10) << "stray agree on " << reqid << " tid " << tid
+	       << ", already committing, will resend COMMIT" << dendl;
+      assert(!server_ready);
+      // will re-send commit when receiving the server ready message
     }
     else {
-      dout(10) << "stray agree on " << reqid
-	       << " tid " << tid
-	       << ", sending ROLLBACK"
-	       << dendl;      
+      dout(10) << "stray agree on " << reqid << " tid " << tid
+	       << ", sending ROLLBACK" << dendl;
+      assert(!server_ready);
       MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_ROLLBACK, 0, tid);
       mds->send_message_mds(req, mds->mdsmap->get_tableserver());
     }
@@ -102,6 +99,9 @@ void MDSTableClient::handle_request(class MMDSTableRequest *m)
     break;
 
   case TABLESERVER_OP_SERVER_READY:
+    assert(!server_ready);
+    server_ready = true;
+
     if (last_reqid == ~0ULL)
       last_reqid = reqid;
 
@@ -144,26 +144,18 @@ void MDSTableClient::_prepare(bufferlist& mutation, version_t *ptid, bufferlist
   uint64_t reqid = ++last_reqid;
   dout(10) << "_prepare " << reqid << dendl;
 
-  // send message
-  MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, reqid);
-  req->bl = mutation;
-
   pending_prepare[reqid].mutation = mutation;
   pending_prepare[reqid].ptid = ptid;
   pending_prepare[reqid].pbl = pbl;
   pending_prepare[reqid].onfinish = onfinish;
 
-  send_to_tableserver(req);
-}
-
-void MDSTableClient::send_to_tableserver(MMDSTableRequest *req)
-{
-  int ts = mds->mdsmap->get_tableserver();
-  if (mds->mdsmap->get_state(ts) >= MDSMap::STATE_CLIENTREPLAY)
-    mds->send_message_mds(req, ts);
-  else {
-    dout(10) << " deferring request to not-yet-active tableserver mds." << ts << dendl;
-  }
+  if (server_ready) {
+    // send message
+    MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_PREPARE, reqid);
+    req->bl = mutation;
+    mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+  } else
+    dout(10) << "tableserver is not ready yet, deferring request" << dendl;
 }
 
 void MDSTableClient::commit(version_t tid, LogSegment *ls)
@@ -176,9 +168,12 @@ void MDSTableClient::commit(version_t tid, LogSegment *ls)
 
   assert(g_conf->mds_kill_mdstable_at != 4);
 
-  // send message
-  MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_COMMIT, 0, tid);
-  send_to_tableserver(req);
+  if (server_ready) {
+    // send message
+    MMDSTableRequest *req = new MMDSTableRequest(table, TABLESERVER_OP_COMMIT, 0, tid);
+    mds->send_message_mds(req, mds->mdsmap->get_tableserver());
+  } else
+    dout(10) << "tableserver is not ready yet, deferring request" << dendl;
 }
 
 
@@ -228,3 +223,12 @@ void MDSTableClient::resend_prepares()
     mds->send_message_mds(req, mds->mdsmap->get_tableserver());
   }
 }
+
+void MDSTableClient::handle_mds_failure(int who)
+{
+  if (who != mds->mdsmap->get_tableserver())
+    return; // do nothing.
+
+  dout(7) << "tableserver mds." << who << " fails" << dendl;
+  server_ready = false;
+}
diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
index 934f5fe..f8a84eb 100644
--- a/src/mds/MDSTableClient.h
+++ b/src/mds/MDSTableClient.h
@@ -30,6 +30,8 @@ protected:
 
   uint64_t last_reqid;
 
+  bool server_ready;
+
   // prepares
   struct _pending_prepare {
     Context *onfinish;
@@ -63,7 +65,8 @@ protected:
   void _logged_ack(version_t tid);
 
 public:
-  MDSTableClient(MDS *m, int tab) : mds(m), table(tab), last_reqid(~0ULL) {}
+  MDSTableClient(MDS *m, int tab) :
+    mds(m), table(tab), last_reqid(~0ULL), server_ready(false) {}
   virtual ~MDSTableClient() {}
 
   void handle_request(MMDSTableRequest *m);
@@ -85,7 +88,7 @@ public:
     ack_waiters[tid].push_back(c);
   }
 
-  void send_to_tableserver(MMDSTableRequest *req);
+  void handle_mds_failure(int mds);
 
   // child must implement
   virtual void resend_queries() = 0;
-- 
1.7.11.7
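
In short, the change above gates PREPARE/COMMIT on a server_ready flag:
requests raised while the table server is not yet active stay queued in
pending_prepare/pending_commit and are (re)sent once SERVER_READY arrives,
and the flag is dropped again when the table server fails. A minimal
stand-alone sketch of that gating pattern, using invented names rather
than the real MDSTableClient interface:

  #include <functional>
  #include <vector>

  // Hypothetical gate: hold requests back until the table server says it is ready.
  struct DeferringClient {
    bool server_ready;
    std::vector<std::function<void()> > deferred;  // sends queued while not ready

    DeferringClient() : server_ready(false) {}

    void send_or_defer(std::function<void()> send) {
      if (server_ready)
        send();                    // peer is in a proper state, send right away
      else
        deferred.push_back(send);  // keep it until SERVER_READY arrives
    }

    void handle_server_ready() {   // on TABLESERVER_OP_SERVER_READY
      server_ready = true;
      for (size_t i = 0; i < deferred.size(); ++i)
        deferred[i]();             // (re)send everything that was queued
      deferred.clear();
    }

    void handle_tableserver_failure() {
      server_ready = false;        // wait for the next SERVER_READY before sending
    }
  };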


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* [PATCH] mds: don't roll back prepared table updates
  2013-04-01  8:51 ` [PATCH] mds: avoid sending duplicated table prepare/commit Yan, Zheng
@ 2013-04-01  8:51   ` Yan, Zheng
  0 siblings, 0 replies; 117+ messages in thread
From: Yan, Zheng @ 2013-04-01  8:51 UTC (permalink / raw)
  To: ceph-devel, sage, greg; +Cc: Yan, Zheng

From: "Yan, Zheng" <zheng.z.yan@intel.com>

When the table server is recovering, it re-sends 'agree' messages for
prepared table updates. It is possible for the table client to receive
an 'agree' message before it has committed the corresponding update.
Don't send a 'rollback' message back to the server in this case.
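
The bookkeeping can be sketched with simplified types; the three maps
mirror the ones used in the patch, but the handler below is a hypothetical
stand-in rather than the real MDSTableClient code:

  #include <cassert>
  #include <map>
  #include <stdint.h>

  typedef uint64_t reqid_t;
  typedef uint64_t tid_t;

  struct ClientSketch {
    std::map<reqid_t, int> pending_prepare;    // prepare sent, no 'agree' yet
    std::map<tid_t, reqid_t> prepared_update;  // 'agree' seen, commit() not called yet
    std::map<tid_t, int> pending_commit;       // commit in flight, waiting for 'ack'

    // Decide what to do with an 'agree' for (reqid, tid).
    // Returns true if a ROLLBACK should be sent back to the server.
    bool handle_agree(reqid_t reqid, tid_t tid) {
      if (pending_prepare.count(reqid)) {      // first 'agree' for this prepare
        pending_prepare.erase(reqid);
        prepared_update[tid] = reqid;          // remember it until commit()
        return false;
      }
      if (prepared_update.count(tid)) {        // duplicated 'agree' from a recovering server
        assert(prepared_update[tid] == reqid);
        return false;                          // prepared but not committed: don't roll back
      }
      if (pending_commit.count(tid))           // already committing: resend COMMIT instead
        return false;
      return true;                             // truly stray tid: safe to roll back
    }
  };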

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDSTableClient.cc | 11 ++++++++++-
 src/mds/MDSTableClient.h  |  1 +
 src/mds/MDSTableServer.cc |  9 ++++++---
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc
index 2ce3286..b478149 100644
--- a/src/mds/MDSTableClient.cc
+++ b/src/mds/MDSTableClient.cc
@@ -59,11 +59,17 @@ void MDSTableClient::handle_request(class MMDSTableRequest *m)
       if (pending_prepare[reqid].pbl)
 	*pending_prepare[reqid].pbl = m->bl;
       pending_prepare.erase(reqid);
+      prepared_update[tid] = reqid;
       if (onfinish) {
         onfinish->finish(0);
         delete onfinish;
       }
-    } 
+    }
+    else if (prepared_update.count(tid)) {
+      dout(10) << "got duplicated agree on " << reqid << " atid " << tid << dendl;
+      assert(prepared_update[tid] == reqid);
+      assert(!server_ready);
+    }
     else if (pending_commit.count(tid)) {
       dout(10) << "stray agree on " << reqid << " tid " << tid
 	       << ", already committing, will resend COMMIT" << dendl;
@@ -162,6 +168,9 @@ void MDSTableClient::commit(version_t tid, LogSegment *ls)
 {
   dout(10) << "commit " << tid << dendl;
 
+  assert(prepared_update.count(tid));
+  prepared_update.erase(tid);
+
   assert(pending_commit.count(tid) == 0);
   pending_commit[tid] = ls;
   ls->pending_commit_tids[table].insert(tid);
diff --git a/src/mds/MDSTableClient.h b/src/mds/MDSTableClient.h
index f8a84eb..16b14c4 100644
--- a/src/mds/MDSTableClient.h
+++ b/src/mds/MDSTableClient.h
@@ -45,6 +45,7 @@ protected:
   };
 
   map<uint64_t, _pending_prepare> pending_prepare;
+  map<version_t, uint64_t> prepared_update;
   list<_pending_prepare> waiting_for_reqid;
 
   // pending commits
diff --git a/src/mds/MDSTableServer.cc b/src/mds/MDSTableServer.cc
index 00bea5e..b775246 100644
--- a/src/mds/MDSTableServer.cc
+++ b/src/mds/MDSTableServer.cc
@@ -120,10 +120,13 @@ void MDSTableServer::_commit_logged(MMDSTableRequest *req)
 void MDSTableServer::handle_rollback(MMDSTableRequest *req)
 {
   dout(7) << "handle_rollback " << *req << dendl;
-  _rollback(req->get_tid());
-  _note_rollback(req->get_tid());
+
+  version_t tid = req->get_tid();
+  assert(pending_for_mds.count(tid));
+  _rollback(tid);
+  _note_rollback(tid);
   mds->mdlog->start_submit_entry(new ETableServer(table, TABLESERVER_OP_ROLLBACK, 0, -1, 
-						  req->get_tid(), version));
+						  tid, version));
   req->put();
 }
 
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 117+ messages in thread

* Re: [PATCH 00/39] fixes for MDS cluster recovery
  2013-04-01  8:46 ` [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
@ 2013-04-01 17:00   ` Gregory Farnum
  0 siblings, 0 replies; 117+ messages in thread
From: Gregory Farnum @ 2013-04-01 17:00 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, Sage Weil

On Mon, Apr 1, 2013 at 1:46 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> On 03/17/2013 10:51 PM, Yan, Zheng wrote:
>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>
>> This serie fixes issues I encountered when running random MDS restart tests.
>> With these patches, my 3 MDS setup that runs fsstress + thrash_exports can
>> survive restarting one or two MDS dozens of times.
>>
>> But there still are lots of unsolved problems, Sometimes rstat corruption,
>> request hangs ...
>>
>> This patch series are also in:
>>   git://github.com/ukernel/ceph.git wip-mds
>>
>
> I rebased these patches. Replaced patch 5 and patch 6 with two new patches.
>
> [PATCH] mds: avoid sending duplicated table prepare/commit
> [PATCH] mds: don't roll back prepared table updates

All right, I've merged this whole series into master at last. I edited
one commit to give MMDSCacheRejoin the new versioned encoding and added
one commit to bump the MDS protocol version, like we talked about.
Thanks for all the fixes!
-Greg
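
(For readers not familiar with the versioned encoding mentioned above:
new-style Ceph structures wrap their payload in ENCODE_START/ENCODE_FINISH
and DECODE_START/DECODE_FINISH so the struct version and the oldest
compatible version travel with the encoded data. The toy struct below only
illustrates that pattern, assuming the Ceph tree's encoding helpers are in
scope; it is not the real MMDSCacheRejoin layout.)

  #include "include/encoding.h"   // Ceph encoding helpers (ENCODE_START, ::encode, ...)

  // Illustrative only; the name, field and version numbers are made up.
  struct example_payload {
    uint32_t op;

    void encode(bufferlist& bl) const {
      ENCODE_START(2, 1, bl);     // struct version 2, decodable by version >= 1
      ::encode(op, bl);
      ENCODE_FINISH(bl);
    }

    void decode(bufferlist::iterator& p) {
      DECODE_START(2, p);
      ::decode(op, p);
      DECODE_FINISH(p);
    }
  };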

^ permalink raw reply	[flat|nested] 117+ messages in thread

end of thread

Thread overview: 117+ messages
2013-03-17 14:51 [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
2013-03-17 14:51 ` [PATCH 01/39] mds: preserve subtree bounds until slave commit Yan, Zheng
2013-03-20 18:33   ` Greg Farnum
2013-03-17 14:51 ` [PATCH 02/39] mds: process finished contexts in batch Yan, Zheng
2013-03-20 18:33   ` Greg Farnum
2013-03-17 14:51 ` [PATCH 03/39] mds: fix MDCache::adjust_bounded_subtree_auth() Yan, Zheng
2013-03-20 18:33   ` Greg Farnum
2013-03-17 14:51 ` [PATCH 04/39] mds: make sure table request id unique Yan, Zheng
2013-03-19 23:09   ` Greg Farnum
2013-03-20  5:53     ` Yan, Zheng
2013-03-20  6:15       ` Sage Weil
2013-03-20  6:24         ` Yan, Zheng
2013-03-20  6:49         ` Yan, Zheng
2013-03-20 18:31           ` Greg Farnum
2013-03-21  8:07             ` Yan, Zheng
2013-03-21 22:03               ` Gregory Farnum
2013-03-25 11:30                 ` Yan, Zheng
2013-03-29 22:12                   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 05/39] mds: send table request when peer is in proper state Yan, Zheng
2013-03-20 18:34   ` Greg Farnum
2013-03-29 21:58   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 06/39] mds: make table client/server tolerate duplicated message Yan, Zheng
2013-03-29 22:00   ` Gregory Farnum
2013-03-31 13:21     ` Yan, Zheng
2013-03-17 14:51 ` [PATCH 07/39] mds: mark connection down when MDS fails Yan, Zheng
2013-03-20 18:37   ` Greg Farnum
2013-03-17 14:51 ` [PATCH 08/39] mds: consider MDS as recovered when it reaches clientreply state Yan, Zheng
2013-03-20 18:40   ` Greg Farnum
2013-03-21  2:22     ` Yan, Zheng
2013-03-21 21:43       ` Gregory Farnum
2013-03-20 19:09   ` Greg Farnum
2013-03-17 14:51 ` [PATCH 09/39] mds: defer eval gather locks when removing replica Yan, Zheng
2013-03-20 19:36   ` Greg Farnum
2013-03-21  2:29     ` Yan, Zheng
2013-03-17 14:51 ` [PATCH 10/39] mds: unify slave request waiting Yan, Zheng
2013-03-20 22:52   ` Sage Weil
2013-03-17 14:51 ` [PATCH 11/39] mds: don't delay processing replica buffer in slave request Yan, Zheng
2013-03-20 21:19   ` Greg Farnum
2013-03-21  2:38     ` Yan, Zheng
2013-03-21  4:15       ` Sage Weil
2013-03-21 21:48         ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 12/39] mds: compose and send resolve messages in batch Yan, Zheng
2013-03-20 21:45   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 13/39] mds: don't send resolve message between active MDS Yan, Zheng
2013-03-20 21:56   ` Gregory Farnum
2013-03-21  2:55     ` Yan, Zheng
2013-03-21 21:55       ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 14/39] mds: set resolve/rejoin gather MDS set in advance Yan, Zheng
2013-03-20 22:09   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 15/39] mds: don't send MDentry{Link,Unlink} before receiving cache rejoin Yan, Zheng
2013-03-20 22:17   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 16/39] mds: send cache rejoin messages after gathering all resolves Yan, Zheng
2013-03-20 22:57   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 17/39] mds: send resolve acks after master updates are safely logged Yan, Zheng
2013-03-20 22:58   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 18/39] mds: fix MDS recovery involving cross authority rename Yan, Zheng
2013-03-21 17:59   ` Gregory Farnum
2013-03-22  3:04     ` Yan, Zheng
2013-03-29 22:02       ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 19/39] mds: remove MDCache::rejoin_fetch_dirfrags() Yan, Zheng
2013-03-20 22:58   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 20/39] mds: include replica nonce in MMDSCacheRejoin::inode_strong Yan, Zheng
2013-03-20 23:26   ` Gregory Farnum
2013-03-20 23:36     ` Sage Weil
2013-03-17 14:51 ` [PATCH 21/39] mds: encode dirfrag base in cache rejoin ack Yan, Zheng
2013-03-20 23:33   ` Gregory Farnum
2013-03-20 23:40     ` Gregory Farnum
2013-03-21  6:41     ` Yan, Zheng
2013-03-21 21:58       ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 22/39] mds: handle linkage mismatch during cache rejoin Yan, Zheng
2013-03-21 21:23   ` Gregory Farnum
2013-03-22  3:05     ` Yan, Zheng
2013-03-25 16:14       ` Gregory Farnum
2013-03-26  7:21     ` Yan, Zheng
2013-03-29 22:09       ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 23/39] mds: reqid for rejoinning authpin/wrlock need to be list Yan, Zheng
2013-03-20 23:59   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 24/39] mds: take object's versionlock when rejoinning xlock Yan, Zheng
2013-03-21  0:37   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 25/39] mds: share inode max size after MDS recovers Yan, Zheng
2013-03-21  0:45   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 26/39] mds: issue caps when lock state in replica become SYNC Yan, Zheng
2013-03-21  0:52   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 27/39] mds: send lock action message when auth MDS is in proper state Yan, Zheng
2013-03-21  3:12   ` Gregory Farnum
2013-03-21  3:20     ` Yan, Zheng
2013-03-17 14:51 ` [PATCH 28/39] mds: add dirty imported dirfrag to LogSegment Yan, Zheng
2013-03-21  3:14   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 29/39] mds: avoid double auth pin for file recovery Yan, Zheng
2013-03-21  3:20   ` Gregory Farnum
2013-03-21  3:33     ` Yan, Zheng
2013-03-21  4:20       ` Sage Weil
2013-03-21 21:58     ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 30/39] mds: check MDS peer's state through mdsmap Yan, Zheng
2013-03-21  3:24   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 31/39] mds: unfreeze subtree if import aborts in PREPPED state Yan, Zheng
2013-03-21  3:27   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 32/39] mds: fix export cancel notification Yan, Zheng
2013-03-21  3:31   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 33/39] mds: notify bystanders if export aborts Yan, Zheng
2013-03-21  3:34   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 34/39] mds: don't open dirfrag while subtree is frozen Yan, Zheng
2013-03-21  3:38   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 35/39] mds: clear dirty inode rstat if import fails Yan, Zheng
2013-03-21  3:40   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 36/39] mds: try merging subtree after clear EXPORTBOUND Yan, Zheng
2013-03-21  3:44   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 37/39] mds: eval inodes with caps imported by cache rejoin message Yan, Zheng
2013-03-21  3:45   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 38/39] mds: don't replicate purging dentry Yan, Zheng
2013-03-21  3:46   ` Gregory Farnum
2013-03-17 14:51 ` [PATCH 39/39] mds: clear scatter dirty if replica inode has no auth subtree Yan, Zheng
2013-03-21  3:49   ` Gregory Farnum
2013-04-01  8:46 ` [PATCH 00/39] fixes for MDS cluster recovery Yan, Zheng
2013-04-01 17:00   ` Gregory Farnum
2013-04-01  8:51 ` [PATCH] mds: avoid sending duplicated table prepare/commit Yan, Zheng
2013-04-01  8:51   ` [PATCH] mds: don't roll back prepared table updates Yan, Zheng
