From: huang jun <hjwsm1989@gmail.com>
To: ceph-devel <ceph-devel@vger.kernel.org>
Subject: can't read/write after inserting new crushmap
Date: Wed, 8 Feb 2012 17:58:32 +0800 [thread overview]
Message-ID: <CABAwU-ZZ2Nz20i0SjWT2_1USJRToTELqSU++5pz4MOcH1g7EGg@mail.gmail.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 3653 bytes --]
Hi all,
We are testing with 8 OSDs grouped into 2 racks, 4 OSDs each.
We wrote the crushmap file and injected it into the ceph cluster
(crushmap file attached), and all PGs were distributed according to
the crush rules.
Then one group (4 OSDs) was powered off, and "ceph -w" shows:
2012-02-08 17:03:57.518285 pg v633: 1584 pgs: 1092 active,
490 active+clean+degraded, 2 degraded+peering; 3992 MB data, 6665 MB
used, 4642 GB / 4657 GB avail; 349/2040 degraded (17.108%)
2012-02-08 17:03:57.520698 mds e4: 1/1/1 up {0=0=up:active}
2012-02-08 17:03:57.520729 osd e86: 8 osds: 4 up, 4 in
2012-02-08 17:03:57.521199 log 2012-02-08 15:26:21.761073
mon0 192.168.0.116:6789/0 27 : [INF] osd7 out (down for 304.299392)
2012-02-08 17:03:57.521249 mon e1: 1 mons at {0=192.168.0.116:6789/0}
Two PGs are stuck in the "degraded+peering" state and never seem to
return to the normal "active+clean" state. (We are using ceph v0.35;
maybe that matters.)
Checking the pg dump output:
2.1p3 0 0 0 0 0 0 0 0 active 0'0 4'120 [3] [3,0] 0'0
2.0p2 0 0 0 0 0 0 0 0 active 0'0 3'119 [2] [2,0] 0'0
0.1p1 0 0 0 0 0 0 0 0 active 0'0 3'103 [1] [1,0] 0'0
0.0p0 0 0 0 0 0 0 0 0 active 0'0 2'126 [0] [0,1] 0'0
1.1p0 0 0 0 0 0 0 0 0 active 0'0 2'123 [0] [0,2] 0'0
1.0p1 0 0 0 0 0 0 0 0 active 0'0 3'102 [1] [1,0] 0'0
2.0p3 0 0 0 0 0 0 0 0 active 0'0 4'122 [3] [3,0] 0'0
2.1p2 0 0 0 0 0 0 0 0 active 0'0 3'122 [2] [2,0] 0'0
0.0p1 0 0 0 0 0 0 0 0 active 0'0 3'116 [1] [1,0] 0'0
0.1p0 0 0 0 0 0 0 0 0 active 0'0 2'115 [0] [0,2] 0'0
1.0p0 0 0 0 0 0 0 0 0 active 0'0 2'115 [0] [0,2] 0'0
1.1p1 0 0 0 0 0 0 0 0 active 0'0 3'116 [1] [1,0] 0'0
2.1p1 0 0 0 0 0 0 0 0 active 0'0 3'121 [1] [1,0] 0'0
2.0p0 0 0 0 0 0 0 0 0 active 0'0 2'121 [0] [0,2] 0'0
0.1p3 0 0 0 0 0 0 0 0 active 0'0 4'115 [3] [3,0] 0'0
0.0p2 0 0 0 0 0 0 0 0 active 0'0 3'117 [2] [2,0] 0'0
1.1p2 0 0 0 0 0 0 0 0 active 0'0 3'119 [2] [2,0] 0'0
1.0p3 0 0 0 0 0 0 0 0 active 0'0 4'115 [3] [3,0] 0'0
2.0p1 0 0 0 0 0 0 0 0 active 0'0 3'116 [1] [1,0] 0'0
2.1p0 0 0 0 0 0 0 0 0 active 0'0 2'124 [0] [0,1] 0'0
Let's take pg 2.1p3 as an example:
why are the up set and the acting set not equal?
Does data migration occur in the OSD cluster under this condition?
That is what we are most concerned about.
If so, the data distribution does not follow the rules set by the crushmap.
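For what it's worth, here is a toy model of what we think happens (this
is only an illustration, not the real CRUSH implementation; the hash
function and the pg_temp padding logic are simplified assumptions): with
rack1 powered off, "step chooseleaf firstn 0 type rack" can only return
one live OSD, so the up set shrinks to a single OSD, while an extra live
OSD is appended to the acting set so the PG can still hold two copies:

```python
import hashlib

# Toy cluster layout matching the attached crushmap:
# rack0 holds osd.0-3, rack1 holds osd.4-7.
RACKS = {"rack0": [0, 1, 2, 3], "rack1": [4, 5, 6, 7]}

def h(*parts):
    """Deterministic pseudo-random draw (stand-in for rjenkins1)."""
    data = ".".join(str(p) for p in parts).encode()
    return int.from_bytes(hashlib.md5(data).digest()[:4], "big")

def crush_map(pgid, live_osds, num_rep=2):
    """Toy 'step chooseleaf firstn 0 type rack': one live OSD per rack."""
    placement = []
    for rack, osds in sorted(RACKS.items()):
        alive = [o for o in osds if o in live_osds]
        if alive:
            # pick one leaf inside this rack pseudo-randomly
            placement.append(max(alive, key=lambda o: h(pgid, rack, o)))
    return placement[:num_rep]

def acting_set(up_set, live_osds, num_rep=2):
    """Toy pg_temp: pad the up set with other live OSDs to reach num_rep."""
    acting = list(up_set)
    for o in sorted(live_osds):
        if len(acting) >= num_rep:
            break
        if o not in acting:
            acting.append(o)
    return acting

alive = {0, 1, 2, 3}            # rack1 (osd.4-7) powered off
up = crush_map("2.1p3", alive)
acting = acting_set(up, alive)
print(up, acting)               # up has 1 OSD, acting has 2
```

In this toy model the rule can only emit one replica because only one
rack is alive, which would explain an up set like [3] alongside an
acting set like [3,0]: the second OSD in acting is temporary padding,
not a placement chosen by the crush rule.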
--
thanks,
huangjun
[-- Attachment #2: crush.txt --]
[-- Type: text/plain, Size: 1718 bytes --]
# begin crush map
# devices
device 0 device0
device 1 device1
device 2 device2
device 3 device3
device 4 device4
device 5 device5
device 6 device6
device 7 device7
# types
type 0 device
type 1 host
type 2 rack
type 3 root
# buckets
host host0 {
id -1 # do not change unnecessarily
# weight 2.000
alg straw
hash 0 # rjenkins1
item device0 weight 1.000
item device1 weight 1.000
}
host host1 {
id -2 # do not change unnecessarily
# weight 2.000
alg straw
hash 0 # rjenkins1
item device2 weight 1.000
item device3 weight 1.000
}
host host2 {
id -3 # do not change unnecessarily
# weight 2.000
alg straw
hash 0 # rjenkins1
item device4 weight 1.000
item device5 weight 1.000
}
host host3 {
id -4 # do not change unnecessarily
# weight 2.000
alg straw
hash 0 # rjenkins1
item device6 weight 1.000
item device7 weight 1.000
}
rack rack0 {
id -5 # do not change unnecessarily
# weight 4.000
alg straw
hash 0 # rjenkins1
item host0 weight 2.000
item host1 weight 2.000
}
rack rack1 {
id -6 # do not change unnecessarily
# weight 4.000
alg straw
hash 0 # rjenkins1
item host2 weight 2.000
item host3 weight 2.000
}
root root {
id -7 # do not change unnecessarily
# weight 8.000
alg straw
hash 0 # rjenkins1
item rack0 weight 4.000
item rack1 weight 4.000
}
# rules
rule data {
ruleset 0
type replicated
min_size 2
max_size 2
step take root
step chooseleaf firstn 0 type rack
step emit
}
rule metadata {
ruleset 1
type replicated
min_size 2
max_size 2
step take root
step chooseleaf firstn 0 type rack
step emit
}
rule rbd {
ruleset 2
type replicated
min_size 2
max_size 2
step take root
step chooseleaf firstn 0 type rack
step emit
}
# end crush map
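As an aside, since every bucket above uses "alg straw", here is a
simplified sketch of how a straw bucket picks an item (the real code
uses rjenkins1 and precomputed straw lengths; the hash and the linear
weight scaling below are illustrative assumptions only):

```python
import hashlib

def draw(pgid, item):
    """Deterministic pseudo-random value in [0, 1) (stand-in for rjenkins1)."""
    data = f"{pgid}.{item}".encode()
    v = int.from_bytes(hashlib.md5(data).digest()[:8], "big")
    return v / 2**64

def straw_select(pgid, items):
    """Each item draws a 'straw' scaled by its weight; the longest straw wins."""
    best, best_straw = None, -1.0
    for item, weight in items.items():
        straw = draw(pgid, item) * weight
        if straw > best_straw:
            best, best_straw = item, straw
    return best

# host0 from the map above: two devices with equal weight
host0 = {"device0": 1.0, "device1": 1.0}
picks = [straw_select(p, host0) for p in range(1000)]
```

With equal weights the picks split roughly evenly between the two
devices, and because each item competes independently, adjusting one
item's weight shifts only the mappings involving that item.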
Thread overview: 2+ messages
2012-02-08 9:58 huang jun [this message]
2012-02-09 19:29 ` can't read/write after inserting new crushmap Josh Durgin