diff options
Diffstat (limited to 'src/lib')
-rw-r--r-- | src/lib/sos/plugins/cluster.py | 115 | ||||
-rw-r--r-- | src/lib/sos/plugins/general.py | 4 | ||||
-rw-r--r-- | src/lib/sos/plugins/kernel.py | 8 | ||||
-rw-r--r-- | src/lib/sos/plugins/networking.py | 2 | ||||
-rw-r--r-- | src/lib/sos/plugins/process.py | 42 | ||||
-rw-r--r-- | src/lib/sos/plugins/rpm.py | 4 | ||||
-rw-r--r-- | src/lib/sos/plugintools.py | 31 | ||||
-rwxr-xr-x | src/lib/sos/policyredhat.py | 22 |
8 files changed, 160 insertions, 68 deletions
diff --git a/src/lib/sos/plugins/cluster.py b/src/lib/sos/plugins/cluster.py index 02fede4f..62315bc6 100644 --- a/src/lib/sos/plugins/cluster.py +++ b/src/lib/sos/plugins/cluster.py @@ -13,7 +13,7 @@ ## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. import sos.plugintools -import commands, os +import commands, os, re import time, libxml2 class cluster(sos.plugintools.PluginBase): @@ -22,7 +22,7 @@ class cluster(sos.plugintools.PluginBase): optionList = [("gfslockdump", 'gather output of gfs lockdumps', 'slow', False), ('lockdump', 'gather dlm lockdumps', 'slow', False), - ('taskdump', 'trigger 3 SysRq+t dumps every 5 seconds', 'slow', False)] + ('taskdump', 'trigger 3 sysrq+t dumps every 5 seconds (dangerous)', 'slow', False)] def checkenabled(self): # enable if any related package is installed @@ -33,7 +33,7 @@ class cluster(sos.plugintools.PluginBase): return True # enable if any related file is present - for fname in [ "/etc/cluster/cluster.conf" ]: + for fname in [ "/etc/cluster/cluster.conf", "/proc/cluster" ]: try: os.stat(fname) except:pass else: return True @@ -93,6 +93,13 @@ class cluster(sos.plugintools.PluginBase): if not ((self.cInfo["policy"].pkgByName("dlm") and self.cInfo["policy"].pkgByName("dlm-kernel")) or self.cInfo["policy"].pkgByName("gulm")): self.addDiagnose("required packages are missing: (dlm, dlm-kernel) || gulm") + # let's make modules are loaded + mods_check = [ "cman", "dlm" ] + if self.has_gfs(): mods_check.append("gfs") + for module in mods_check: + if len(self.fileGrep("^%s " % module, "/proc/modules")) == 0: + self.addDiagnose("required package is present but not loaded: %s" % module) + # check if all the needed daemons are active at sosreport time # check if they are started at boot time in RHEL4 RHCS (cman, ccsd, rgmanager, fenced) # and GFS (gfs, ccsd, clvmd, fenced) @@ -102,29 +109,59 @@ class cluster(sos.plugintools.PluginBase): status, output = commands.getstatusoutput("/sbin/service %s status" % service) if status: self.addDiagnose("service %s is not running" % service) + else: + # service is running, extra sanity checks + if service == "fenced": + # also make sure fenced is a registered cluster service + try: + if len(self.fileGrep("^Fence Domain:\W", "/proc/cluster/services")) == 0: + self.addDiagnose("fencing service is not registered with cman") + except: + pass + elif service == "rgmanager": + # also make sure rgmanager is a registered cluster service + try: + if len(self.fileGrep("^User:\W*usrm::manager", "/proc/cluster/services")) == 0: + self.addDiagnose("rgmanager is not registered with cman") + except: + pass if not self.cInfo["policy"].runlevelDefault() in self.cInfo["policy"].runlevelByService(service): self.addDiagnose("service %s is not started in default runlevel" % service) + # FIXME: any cman service whose state != run ? + # Fence Domain: "default" 2 2 run - + # is cluster quorate if not self.is_cluster_quorate(): self.addDiagnose("cluster node is not quorate") # if there is no cluster.conf, diagnose() finishes here. - try: os.stat("/etc/cluster/cluster.conf") - except: return + try: + os.stat("/etc/cluster/cluster.conf") + except: + self.addDiagnose("/etc/cluster/cluster.conf is missing") + return # setup XML xpath context xml = libxml2.parseFile("/etc/cluster/cluster.conf") xpathContext = xml.xpathNewContext() - # check fencing (warn on empty or manual) + # check fencing (warn on no fencing) if len(xpathContext.xpathEval("/cluster/clusternodes/clusternode[not(fence/method/device)]")): self.addDiagnose("one or more nodes have no fencing agent configured") + # check fencing (warn on manual) if len(xpathContext.xpathEval("/cluster/clusternodes/clusternode[/cluster/fencedevices/fencedevice[@agent='fence_manual']/@name=fence/method/device/@name]")): self.addDiagnose("one or more nodes have manual fencing agent configured (data integrity is not guaranteed)") + # if fence_ilo or fence_drac, make sure acpid is not running + hostname = commands.getoutput("/bin/uname -n").split(".")[0] + if len(xpathContext.xpathEval('/cluster/clusternodes/clusternode[@name = "%s" and /cluster/fencedevices/fencedevice[@agent="fence_rsa" or @agent="fence_drac"]/@name=fence/method/device/@name]' % hostname )): + status, output = commands.getstatusoutput("/sbin/service acpid status") + if status == 0 or self.cInfo["policy"].runlevelDefault() in self.cInfo["policy"].runlevelByService("acpid"): + self.addDiagnose("acpid is enabled, this may cause problems with your fencing method.") + # check for fs exported via nfs without nfsid attribute if len(xpathContext.xpathEval("/cluster/rm/service//fs[not(@fsid)]/nfsexport")): self.addDiagnose("one or more nfs file-system doesn't have a fsid attribute set.") @@ -142,16 +179,15 @@ class cluster(sos.plugintools.PluginBase): # and that the locking protocol is sane cluster_name = xpathContext.xpathEval("/cluster/@name")[0].content - fp = open("/etc/fstab","r") - for fs in fp.readline().split(): -# fs = fs.split() - if not fs or fs[0] == "#" or len(fs) < 6 or fs[2]!="gfs": continue - lockproto = get_gfs_sb_field(fs[1], "sb_lockproto") - if lockproto != "lock_" + get_locking_proto: + for fs in self.fileGrep(r'^[^#][/\w]*\W*[/\w]*\W*gfs', "/etc/fstab"): + # for each gfs entry + fs = fs.split() + + lockproto = self.get_gfs_sb_field(fs[0], "sb_lockproto") + if lockproto and lockproto != self.get_locking_proto(): self.addDiagnose("gfs mountpoint (%s) is using the wrong locking protocol (%s)" % (fs[0], lockproto) ) - locktable = get_gfs_sb_field(fs[1], "sb_locktable") - if not locktable: continue + locktable = self.get_gfs_sb_field(fs[0], "sb_locktable") try: locktable = locktable.split(":")[0] except: continue if locktable != cluster_name: @@ -178,11 +214,15 @@ class cluster(sos.plugintools.PluginBase): return def do_taskdump(self): + if not os.access("/proc/sysrq-trigger", os.W_OK): + return + commands.getstatusoutput("echo t > /proc/sysrq-trigger") - time.sleep(3) + time.sleep(5) commands.getstatusoutput("echo t > /proc/sysrq-trigger") - time.sleep(3) + time.sleep(5) commands.getstatusoutput("echo t > /proc/sysrq-trigger") + self.addCopySpec("/var/log/messages") def do_lockdump(self): @@ -200,22 +240,27 @@ class cluster(sos.plugintools.PluginBase): def get_locking_proto(self): # FIXME: what's the best way to find out ? - return "dlm" - return "gulm" + return "lock_dlm" + return "lock_gulm" def do_gfslockdump(self): fp = open("/proc/mounts","r") for line in fp.readlines(): mntline = line.split(" ") if mntline[2] == "gfs": - self.collectExtOutput("/sbin/gfs_tool lockdump %s" % mntline[1]) + self.collectExtOutput("/sbin/gfs_tool lockdump %s" % mntline[1], root_symlink = "gfs_lockdump_" + self.mangleCommand(mntline[1]) ) fp.close() + def do_rgmgr_bt(self): + # FIXME: threads backtrace + return + def postproc(self): self.doRegexSub("/etc/cluster/cluster.conf", r"(\s*\<fencedevice\s*.*\s*passwd\s*=\s*)\S+(\")", r"\1***") return def is_cluster_quorate(self): + # FIXME: use self.fileGrep() instead output = commands.getoutput("/bin/cat /proc/cluster/status | grep '^Membership state: '") try: if output[18:] == "Cluster-Member": @@ -226,32 +271,8 @@ class cluster(sos.plugintools.PluginBase): pass return None - def get_gfs_sb_field(self, mntpoint, field): - for line in commands.getoutput("/sbin/gfs_tool sb %s all" % fs[1]): - tostrip = " " + field + " = " - if line.startwith(tostrip): - return line[len(tostrip):] - return None - - def xpath_query_count(self, fname, query): - return len(self.xpath_query(fname, query)) - - def xpath_query(self, fname, query): - xml = libxml2.parseFile(fname) - xpathContext = xml.xpathNewContext() - return xpathContext.xpathEval(query) - - # FIXME: use python libxml internals - tmpout = commands.getoutput("/bin/echo xpath %s | /usr/bin/xmllint --shell /etc/cluster/cluster.conf") - for tmpline in tmpout: - if tmpline.startswith("Set contains "): - tmpvalue = tmpline[14:].split(" ")[0] - if tmpvalue == "NULL": return 0 - try: tmpvalue = int(tmpvalue) - except: return False - return tmpvalue + def get_gfs_sb_field(self, device, field): + for line in commands.getoutput("/sbin/gfs_tool sb %s all" % device).split("\n"): + if re.match('^\W*%s = ' % field, line): + return line.split("=")[1].strip() return False - - - - diff --git a/src/lib/sos/plugins/general.py b/src/lib/sos/plugins/general.py index 06f55228..fd18136b 100644 --- a/src/lib/sos/plugins/general.py +++ b/src/lib/sos/plugins/general.py @@ -16,10 +16,10 @@ import sos.plugintools import glob class general(sos.plugintools.PluginBase): - """very basic system information + """basic system information """ - optionList = [("syslogsize", "maximum size (in MiB) of logs to collect per syslog file", "", 15)] + optionList = [("syslogsize", "max size (MiB) to collect per syslog file", "", 15)] def setup(self): self.addCopySpec("/etc/redhat-release") diff --git a/src/lib/sos/plugins/kernel.py b/src/lib/sos/plugins/kernel.py index 07cc6ac7..9a752c13 100644 --- a/src/lib/sos/plugins/kernel.py +++ b/src/lib/sos/plugins/kernel.py @@ -18,8 +18,8 @@ import commands, os, re class kernel(sos.plugintools.PluginBase): """kernel related information """ - optionList = [("modinfo", 'Gathers module information on all modules', 'fast', True), - ('sysrq', 'Trigger SysRq dumps', 'fast', False)] + optionList = [("modinfo", 'gathers module information on all modules', 'fast', True), + ('sysrq', 'trigger SysRq+t dumps', 'fast', False)] moduleFile = "" taintList = [ {'regex':'mvfs*', 'description':'Clearcase module'}, @@ -72,9 +72,7 @@ class kernel(sos.plugintools.PluginBase): self.addCopySpec("/proc/cmdline") self.addCopySpec("/proc/driver") self.addCopySpec("/proc/sys/kernel/tainted") - # trigger some sysrq's. I'm not sure I like doing it this way, but - # since we end up with the sysrq dumps in syslog whether we run the - # syslog report before or after this, I suppose I can live with it. + # FIXME: both RHEL4 and RHEL5 don't need sysrq to be enabled to trigger via sysrq-trigger if self.isOptionEnabled('sysrq') and os.access("/proc/sysrq-trigger", os.W_OK) and os.access("/proc/sys/kernel/sysrq", os.R_OK): sysrq_state = commands.getoutput("/bin/cat /proc/sys/kernel/sysrq") commands.getoutput("/bin/echo 1 > /proc/sys/kernel/sysrq") diff --git a/src/lib/sos/plugins/networking.py b/src/lib/sos/plugins/networking.py index 6d11146b..b7331849 100644 --- a/src/lib/sos/plugins/networking.py +++ b/src/lib/sos/plugins/networking.py @@ -57,7 +57,7 @@ class networking(sos.plugintools.PluginBase): self.addCopySpec("/etc/xinetd.d") self.addCopySpec("/etc/host*") self.addCopySpec("/etc/resolv.conf") - ifconfigFile=self.collectExtOutput("/sbin/ifconfig -a", root_symlink = "ifconfig") + ifconfigFile=self.collectOutputNow("/sbin/ifconfig -a", root_symlink = "ifconfig") self.collectExtOutput("/sbin/route -n", root_symlink = "route") self.collectExtOutput("/sbin/ipchains -nvL") self.collectIPTable("filter") diff --git a/src/lib/sos/plugins/process.py b/src/lib/sos/plugins/process.py index 7ab6e6f5..baa185b4 100644 --- a/src/lib/sos/plugins/process.py +++ b/src/lib/sos/plugins/process.py @@ -13,6 +13,9 @@ ## Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. import sos.plugintools +import commands +import time +import os class process(sos.plugintools.PluginBase): """process information @@ -20,8 +23,41 @@ class process(sos.plugintools.PluginBase): def setup(self): self.collectExtOutput("/bin/ps auxww", root_symlink = "ps") self.collectExtOutput("/usr/bin/pstree", root_symlink = "pstree") - self.collectExtOutput("/usr/bin/ipcs -a") - self.collectExtOutput("/usr/bin/ipcs -u") - self.collectExtOutput("/usr/bin/ipcs -l") return + def find_mountpoint(s): + if (os.path.ismount(s) or len(s)==0): return s + else: return mountpoint(os.path.split(s)[0]) + + def diagnose(self): + # check that no process is in state D + dpids = [] + status, output = commands.getstatusoutput("/bin/ps -A -o state,pid --no-heading") + if not status: + for line in output.split("\n"): + line = line.split() + if line[0] == "D": + # keep an eye on the process to see if the stat changes + for inc in range(1,10): + try: + if len(self.fileGrep("^State: D", " /proc/%d/status" % int(line[1]))) == 0: + # status is not D, good. let's get out of the loop. + time.sleep(0.1) + continue + except IOError: + # this should never happen... + pass + else: + dpids.append(int(line[1])) + + # FIXME: for each hung PID, list file-systems from /proc/$PID/fd +# for pid in dpids: +# realpath + + if len(dpids): + self.addDiagnose("one or more processes are in state D") + + return + + + diff --git a/src/lib/sos/plugins/rpm.py b/src/lib/sos/plugins/rpm.py index 9c6f2659..c899d141 100644 --- a/src/lib/sos/plugins/rpm.py +++ b/src/lib/sos/plugins/rpm.py @@ -17,8 +17,8 @@ import sos.plugintools class rpm(sos.plugintools.PluginBase): """RPM information """ - optionList = [("rpmq", "Queries for package information via rpm -q", "fast", True), - ("rpmva", "Runs a verify on all packages", "slow", True)] + optionList = [("rpmq", "queries for package information via rpm -q", "fast", True), + ("rpmva", "runs a verify on all packages", "slow", True)] def setup(self): self.addCopySpec("/var/log/rpmpkgs") diff --git a/src/lib/sos/plugintools.py b/src/lib/sos/plugintools.py index 8f07fb62..f58c8151 100644 --- a/src/lib/sos/plugintools.py +++ b/src/lib/sos/plugintools.py @@ -313,13 +313,27 @@ class PluginBase: """ self.collectProgs.append( (exe,suggest_filename,root_symlink) ) + def fileGrep(self, regexp, fname): + results = [] + + fp = open(fname, "r") + for line in fp.readlines(): + if re.match(regexp, line): + results.append(line) + fp.close() + return results + + def mangleCommand(self, exe): + # FIXME: this can be improved + mangledname = re.sub(r"^/(usr/|)(bin|sbin)/", "", exe) + mangledname = re.sub(r"[^\w\-\.\/]+", "_", mangledname) + mangledname = re.sub(r"/", ".", mangledname).strip(" ._-")[0:64] + return mangledname + def makeCommandFilename(self, exe): """ The internal function to build up a filename based on a command """ - mangledname = re.sub(r"[^\w\-\.\/]+", "_", exe) - mangledname = re.sub(r"/", ".", mangledname).strip(" ._-")[0:64] - - outfn = self.cInfo['cmddir'] + "/" + self.piName + "/" + mangledname + outfn = self.cInfo['cmddir'] + "/" + self.piName + "/" + self.mangleCommand(exe) # check for collisions if os.path.exists(outfn): @@ -341,6 +355,8 @@ class PluginBase: if not os.access(exe.split()[0], os.X_OK): self.soslog.log(logging.VERBOSE, "binary '%s' does not exist or is not runnable, trying anyways" % exe.split()[0]) + # FIXME: we should have a timeout or we may end waiting forever + # pylint: disable-msg = W0612 status, shout, runtime = sosGetCommandOutput(exe) @@ -363,16 +379,17 @@ class PluginBase: os.symlink(outfn[len(self.cInfo['dstroot'])+1:], root_symlink.strip("/.")) os.chdir(curdir) - outfn = outfn[len(self.cInfo['cmddir'])+1:] + outfn_strip = outfn[len(self.cInfo['cmddir'])+1:] else: self.soslog.log(logging.VERBOSE, "could not run command: %s" % exe) outfn = None + outfn_strip = None # sosStatus(status) # save info for later - self.executedCommands.append({'exe': exe, 'file':outfn}) # save in our list - self.cInfo['xmlreport'].add_command(cmdline=exe,exitcode=status,f_stdout=outfn,runtime=runtime) + self.executedCommands.append({'exe': exe, 'file':outfn_strip}) # save in our list + self.cInfo['xmlreport'].add_command(cmdline=exe,exitcode=status,f_stdout=outfn_strip,runtime=runtime) return outfn def writeTextToCommand(self, exe, text): diff --git a/src/lib/sos/policyredhat.py b/src/lib/sos/policyredhat.py index 7382f740..cbf76d26 100755 --- a/src/lib/sos/policyredhat.py +++ b/src/lib/sos/policyredhat.py @@ -24,6 +24,7 @@ import string from tempfile import gettempdir from sos.helpers import * import random +import re SOME_PATH = "/tmp/SomePath" @@ -114,9 +115,11 @@ class SosPolicy: try: name = raw_input("Please enter your first initial and last name [%s]: " % localname) + name = re.sub(r"[^a-zA-Z.0-9]", "", name) if len(name) == 0: name = localname ticketNumber = raw_input("Please enter the case number that you are generating this report for: ") + ticketNumber = re.sub(r"[^0-9]", "", ticketNumber) except KeyboardInterrupt: print _("<interrupted>") print _("Temporary files have been stored in %s") % self.cInfo['dstroot'] @@ -155,8 +158,25 @@ class SosPolicy: # FIXME: use python internal command os.system("/bin/mv %s %s" % (aliasdir, self.cInfo['dstroot'])) + # add last 6 chars from md5sum to file name + status, md5out = commands.getstatusoutput('''/usr/bin/md5sum "%s"''' % tarballName) + if not status and len(md5out): + oldtarballName = tarballName + try: + md5out = md5out.strip().split()[0] + tarballName = os.path.join(ourtempdir, "sosreport-%s-%s.tar.bz2" % (namestr, md5out[-6:]) ) + os.system("/bin/mv %s %s" % (oldtarballName, tarballName) ) + except: + md5out = False + else: + md5out = False + sys.stdout.write("\n") - print "Your sosreport has been generated and saved in %s" % tarballName + print "Your sosreport has been generated and saved in:\n %s" % tarballName + print + if md5out: + print "The md5sum is: " + md5out + print print "Please send this file to your support representative." sys.stdout.write("\n") |