Archived community.zenoss.org | full text search
Skip navigation
Currently Being Moderated

Make zenwin and zenwinmodeler ignore WMI errors

VERSION 1 
Created on: Sep 14, 2009 11:21 AM by Noel Brockett - Last Modified:  Sep 14, 2009 11:21 AM by Noel Brockett
At least in version 2.1.1, zenwin, zenwinmodeler, and zeneventlog have
(IMO) a critical defect: if a device has any /Status/WMI/Conn events that
are not yet in history, these daemons skip the device entirely. On our
network, for some reason, we end up with a lot of these events
('timegenerated' errors, various intermittent failures to connect, etc.).
This causes the monitoring of our Windows servers to fall off dramatically
as the system runs, and we miss critical issues.

I changed the behavior of these three daemons to go ahead and attempt
monitoring even if WMI issues have been encountered. I learned that most
of the time these WMI issues are spurious and successful monitoring CAN
still be attempted. If you use this code, I recommend combining it with
event commands that restart the Zenoss daemons when they are found dead.

Also, in zenwin, I added/improved the exception handling; in the original
code, a failure to create the watcher object occurred outside of any try
block. Much of this code is an attempt to keep zenwin from crashing if it
tries to monitor a Windows Server 2008 machine (Zenoss is not compatible
with the WMI interface of WS 2008 or Vista, and zenwin cannot monitor
services on these devices). I ended up adding a hardcoded exclusion list
so that such a machine can otherwise still be monitored while zenwin
skips it. For some reason, zeneventlog does not seem to crash, although
it is not able to retrieve events from the WS 2008 machine either.

I'd post the diff for the zeneventlog.py code as well, but it's more
complicated and would require a bit more explanation.



$ZENHOME/Products/ZenWin/zenwinmodeler.py:
@@ -57,8 +57,11 @@
                 continue
             try:
                 if name in self.wmiprobs:
+                    """ 2007-12-14 - Attempt collection anyway as LOTS of devices have bad WMI states...
                     self.log.warn("skipping %s has bad wmi state", name)
                     continue
+                    """
+                    self.log.warn("%s has bad wmi state, processing anyway", name)
                 self.log.info("collecting from %s using user %s", name, user)
                 svcs = self.getServices(name, ip, user, passwd)
                 if not svcs:
@@ -74,7 +77,7 @@
             except pywintypes.com_error, e:
                 msg = self.printComErrorMessage(e)
                 if not msg:
-                    msg = "WMI connect error on %s: %s" % (name)
+                    msg = "WMI connect error on %s: " % (name)
                     code, txt, info, param = e
                     wmsg = "%s: %s" % (abs(code), txt)
                     if info:





$ZENHOME/Products/ZenWin/zenwin.py:
@@ -44,6 +44,7 @@

     def __init__(self):
         Base.__init__(self)
+        self.excludeDevices = []
         self.wmiprobs = []
         self.devices = []
         self.watchers = {}
@@ -95,30 +96,42 @@
             return None
         wql = "select Name from Win32_Service where State='Running'"
         w = self._wmi(srec)
-        w.connect()
-        svcs = [ svc.Name for svc in w.query(wql) ]
-        for name, (status, severity) in srec.services.items():
-            self.log.debug("service: %s status: %d", name, status)
-            if name not in svcs:
-                self.serviceStopped(srec, name)
-            elif status > 0:
-                self.serviceRunning(srec, name)
-        w.close()
+        try:
+            w.connect()
+            svcs = [ svc.Name for svc in w.query(wql) ]
+            for name, (status, severity) in srec.services.items():
+                self.log.debug("service: %s status: %d", name, status)
+                if name not in svcs:
+                    self.serviceStopped(srec, name)
+                elif status > 0:
+                    self.serviceRunning(srec, name)
+            w.close()
+        except Exception, ex:
+            self.log.warn("scanDevice(%s): %s", srec.name, str(ex))
+            self.closeWatcher(w)
+            raise

     def getWatcher(self, srec):
         wql = ("""SELECT * FROM __InstanceModificationEvent within 5 where """
                """TargetInstance ISA 'Win32_Service' """)
-        w = self._wmi(srec)
-        w.connect()
+        try:
+            w = self._wmi(srec)
+            w.connect()
+        except Exception, ex:
+            self.log.warn("getWatcher(%s): %s", srec.name, str(ex))
+            self.closeWatcher(w)
+            raise
         return w.watcher(wql)

     def processDevice(self, srec):
-        w = self.watchers.get(srec.name, None)
-        if not w:
-            self.scanDevice(srec)
-            self.deviceUp(srec)
-            self.watchers[srec.name] = w = self.getWatcher(srec)
+        self.log.info("Processing %s" % srec.name)
         try:
+            w = self.watchers.get(srec.name, None)
+            if not w:
+                self.log.info("Initializing %s, no existing watcher" % srec.name)
+                self.scanDevice(srec)
+                self.deviceUp(srec)
+                self.watchers[srec.name] = w = self.getWatcher(srec)
             self.log.debug("Querying %s", srec.name)
             s = w.nextEvent(100)
             self.deviceUp(srec)
@@ -137,12 +150,33 @@
                 self.log.debug("Codes: %r %r %r %r %r %r" % info)
                 scode = abs(scode)
             if scode != TIMEOUT_CODE:
+                self.log.warn("Codes: %r %r %r %r %r %r" % info)
                 self.deviceDown(srec, '%d: %s' % (code, txt))
+        except Exception, ex:
+            self.log.warn("processDevice(%s): %s", srec.name, str(ex))
+            self.deviceDown(srec, str(ex))
+
+    def closeWatcher(self, w, name=None):
+        try:
+            if w:
+                w.close()
+                w = None
+            if name and name in self.watchers:
+                del self.watchers[name]
+        except Exception, ex:
+            if not name: name = 'None'
+            self.log.warn("Exception closing watcher for %s: %s", name, str(ex))

     def processLoop(self):
         for device in self.devices:
             if device.name in self.wmiprobs:
-                self.log.debug("WMI problems on %s: skipping" % device.name)
+                #self.log.debug("WMI problems on %s: skipping" % device.name)
+                #continue
+               self.log.warn("WMI problems on %s: attempting processing anyway" % device.name)
+            if device.name in self.excludeDevices:
+                self.log.warn("Hardcoded exclusion of device %s", device.name)
                 continue
             try:
                 self.processDevice(device)
             except Exception, ex:
+                w = self.watchers.get(device.name, None)
+                self.closeWatcher(w, device.name)
                 self.deviceDown(device, str(ex))
Comments (0)