diff options
Diffstat (limited to 'src/lib/sos/plugins/cluster.py')
-rw-r--r-- | src/lib/sos/plugins/cluster.py | 115 |
1 files changed, 68 insertions, 47 deletions
diff --git a/src/lib/sos/plugins/cluster.py b/src/lib/sos/plugins/cluster.py index 02fede4f..62315bc6 100644 --- a/src/lib/sos/plugins/cluster.py +++ b/src/lib/sos/plugins/cluster.py @@ -13,7 +13,7 @@ ## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. import sos.plugintools -import commands, os +import commands, os, re import time, libxml2 class cluster(sos.plugintools.PluginBase): @@ -22,7 +22,7 @@ class cluster(sos.plugintools.PluginBase): optionList = [("gfslockdump", 'gather output of gfs lockdumps', 'slow', False), ('lockdump', 'gather dlm lockdumps', 'slow', False), - ('taskdump', 'trigger 3 SysRq+t dumps every 5 seconds', 'slow', False)] + ('taskdump', 'trigger 3 sysrq+t dumps every 5 seconds (dangerous)', 'slow', False)] def checkenabled(self): # enable if any related package is installed @@ -33,7 +33,7 @@ class cluster(sos.plugintools.PluginBase): return True # enable if any related file is present - for fname in [ "/etc/cluster/cluster.conf" ]: + for fname in [ "/etc/cluster/cluster.conf", "/proc/cluster" ]: try: os.stat(fname) except:pass else: return True @@ -93,6 +93,13 @@ class cluster(sos.plugintools.PluginBase): if not ((self.cInfo["policy"].pkgByName("dlm") and self.cInfo["policy"].pkgByName("dlm-kernel")) or self.cInfo["policy"].pkgByName("gulm")): self.addDiagnose("required packages are missing: (dlm, dlm-kernel) || gulm") + # let's make modules are loaded + mods_check = [ "cman", "dlm" ] + if self.has_gfs(): mods_check.append("gfs") + for module in mods_check: + if len(self.fileGrep("^%s " % module, "/proc/modules")) == 0: + self.addDiagnose("required package is present but not loaded: %s" % module) + # check if all the needed daemons are active at sosreport time # check if they are started at boot time in RHEL4 RHCS (cman, ccsd, rgmanager, fenced) # and GFS (gfs, ccsd, clvmd, fenced) @@ -102,29 +109,59 @@ class cluster(sos.plugintools.PluginBase): status, output = commands.getstatusoutput("/sbin/service %s status" % service) if status: self.addDiagnose("service %s is not running" % service) + else: + # service is running, extra sanity checks + if service == "fenced": + # also make sure fenced is a registered cluster service + try: + if len(self.fileGrep("^Fence Domain:\W", "/proc/cluster/services")) == 0: + self.addDiagnose("fencing service is not registered with cman") + except: + pass + elif service == "rgmanager": + # also make sure rgmanager is a registered cluster service + try: + if len(self.fileGrep("^User:\W*usrm::manager", "/proc/cluster/services")) == 0: + self.addDiagnose("rgmanager is not registered with cman") + except: + pass if not self.cInfo["policy"].runlevelDefault() in self.cInfo["policy"].runlevelByService(service): self.addDiagnose("service %s is not started in default runlevel" % service) + # FIXME: any cman service whose state != run ? + # Fence Domain: "default" 2 2 run - + # is cluster quorate if not self.is_cluster_quorate(): self.addDiagnose("cluster node is not quorate") # if there is no cluster.conf, diagnose() finishes here. - try: os.stat("/etc/cluster/cluster.conf") - except: return + try: + os.stat("/etc/cluster/cluster.conf") + except: + self.addDiagnose("/etc/cluster/cluster.conf is missing") + return # setup XML xpath context xml = libxml2.parseFile("/etc/cluster/cluster.conf") xpathContext = xml.xpathNewContext() - # check fencing (warn on empty or manual) + # check fencing (warn on no fencing) if len(xpathContext.xpathEval("/cluster/clusternodes/clusternode[not(fence/method/device)]")): self.addDiagnose("one or more nodes have no fencing agent configured") + # check fencing (warn on manual) if len(xpathContext.xpathEval("/cluster/clusternodes/clusternode[/cluster/fencedevices/fencedevice[@agent='fence_manual']/@name=fence/method/device/@name]")): self.addDiagnose("one or more nodes have manual fencing agent configured (data integrity is not guaranteed)") + # if fence_ilo or fence_drac, make sure acpid is not running + hostname = commands.getoutput("/bin/uname -n").split(".")[0] + if len(xpathContext.xpathEval('/cluster/clusternodes/clusternode[@name = "%s" and /cluster/fencedevices/fencedevice[@agent="fence_rsa" or @agent="fence_drac"]/@name=fence/method/device/@name]' % hostname )): + status, output = commands.getstatusoutput("/sbin/service acpid status") + if status == 0 or self.cInfo["policy"].runlevelDefault() in self.cInfo["policy"].runlevelByService("acpid"): + self.addDiagnose("acpid is enabled, this may cause problems with your fencing method.") + # check for fs exported via nfs without nfsid attribute if len(xpathContext.xpathEval("/cluster/rm/service//fs[not(@fsid)]/nfsexport")): self.addDiagnose("one or more nfs file-system doesn't have a fsid attribute set.") @@ -142,16 +179,15 @@ class cluster(sos.plugintools.PluginBase): # and that the locking protocol is sane cluster_name = xpathContext.xpathEval("/cluster/@name")[0].content - fp = open("/etc/fstab","r") - for fs in fp.readline().split(): -# fs = fs.split() - if not fs or fs[0] == "#" or len(fs) < 6 or fs[2]!="gfs": continue - lockproto = get_gfs_sb_field(fs[1], "sb_lockproto") - if lockproto != "lock_" + get_locking_proto: + for fs in self.fileGrep(r'^[^#][/\w]*\W*[/\w]*\W*gfs', "/etc/fstab"): + # for each gfs entry + fs = fs.split() + + lockproto = self.get_gfs_sb_field(fs[0], "sb_lockproto") + if lockproto and lockproto != self.get_locking_proto(): self.addDiagnose("gfs mountpoint (%s) is using the wrong locking protocol (%s)" % (fs[0], lockproto) ) - locktable = get_gfs_sb_field(fs[1], "sb_locktable") - if not locktable: continue + locktable = self.get_gfs_sb_field(fs[0], "sb_locktable") try: locktable = locktable.split(":")[0] except: continue if locktable != cluster_name: @@ -178,11 +214,15 @@ class cluster(sos.plugintools.PluginBase): return def do_taskdump(self): + if not os.access("/proc/sysrq-trigger", os.W_OK): + return + commands.getstatusoutput("echo t > /proc/sysrq-trigger") - time.sleep(3) + time.sleep(5) commands.getstatusoutput("echo t > /proc/sysrq-trigger") - time.sleep(3) + time.sleep(5) commands.getstatusoutput("echo t > /proc/sysrq-trigger") + self.addCopySpec("/var/log/messages") def do_lockdump(self): @@ -200,22 +240,27 @@ class cluster(sos.plugintools.PluginBase): def get_locking_proto(self): # FIXME: what's the best way to find out ? - return "dlm" - return "gulm" + return "lock_dlm" + return "lock_gulm" def do_gfslockdump(self): fp = open("/proc/mounts","r") for line in fp.readlines(): mntline = line.split(" ") if mntline[2] == "gfs": - self.collectExtOutput("/sbin/gfs_tool lockdump %s" % mntline[1]) + self.collectExtOutput("/sbin/gfs_tool lockdump %s" % mntline[1], root_symlink = "gfs_lockdump_" + self.mangleCommand(mntline[1]) ) fp.close() + def do_rgmgr_bt(self): + # FIXME: threads backtrace + return + def postproc(self): self.doRegexSub("/etc/cluster/cluster.conf", r"(\s*\<fencedevice\s*.*\s*passwd\s*=\s*)\S+(\")", r"\1***") return def is_cluster_quorate(self): + # FIXME: use self.fileGrep() instead output = commands.getoutput("/bin/cat /proc/cluster/status | grep '^Membership state: '") try: if output[18:] == "Cluster-Member": @@ -226,32 +271,8 @@ class cluster(sos.plugintools.PluginBase): pass return None - def get_gfs_sb_field(self, mntpoint, field): - for line in commands.getoutput("/sbin/gfs_tool sb %s all" % fs[1]): - tostrip = " " + field + " = " - if line.startwith(tostrip): - return line[len(tostrip):] - return None - - def xpath_query_count(self, fname, query): - return len(self.xpath_query(fname, query)) - - def xpath_query(self, fname, query): - xml = libxml2.parseFile(fname) - xpathContext = xml.xpathNewContext() - return xpathContext.xpathEval(query) - - # FIXME: use python libxml internals - tmpout = commands.getoutput("/bin/echo xpath %s | /usr/bin/xmllint --shell /etc/cluster/cluster.conf") - for tmpline in tmpout: - if tmpline.startswith("Set contains "): - tmpvalue = tmpline[14:].split(" ")[0] - if tmpvalue == "NULL": return 0 - try: tmpvalue = int(tmpvalue) - except: return False - return tmpvalue + def get_gfs_sb_field(self, device, field): + for line in commands.getoutput("/sbin/gfs_tool sb %s all" % device).split("\n"): + if re.match('^\W*%s = ' % field, line): + return line.split("=")[1].strip() return False - - - - |