Trees | Indices | Help |
|
---|
|
#! /usr/bin/env python
###########################################################################
#
# This program is part of Zenoss Core, an open source monitoring platform.
# Copyright (C) 2007, 2010 Zenoss Inc.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 or (at your
# option) any later version as published by the Free Software Foundation.
#
# For complete information please visit: http://www.zenoss.com/oss/
#
###########################################################################

__doc__="""zenperfsnmp

Gets SNMP performance data and stores it in RRD files.

"""
# NOTE(review): this module was recovered from a flattened source listing in
# which the `class` / `def` header lines and all indentation were lost.
# Headers and nesting marked "reconstructed" below were inferred from call
# sites, docstrings and the __main__ block -- confirm against upstream.

import random
import logging
log = logging.getLogger("zen.zenperfsnmp")

import Globals
import zope.component
import zope.interface

from twisted.internet import defer, error
from twisted.python.failure import Failure
from pynetsnmp.twistedsnmp import AgentProxy, snmpprotocol, Snmpv3Error

from Products.ZenCollector.daemon import CollectorDaemon
from Products.ZenCollector.interfaces import ICollectorPreferences,\
                                             IDataService,\
                                             IEventService,\
                                             IScheduledTask
from Products.ZenCollector.tasks import SimpleTaskFactory,\
                                        SimpleTaskSplitter,\
                                        TaskStates, \
                                        BaseTask
from Products.ZenUtils.Utils import importClass, readable_time
from Products.ZenUtils.Chain import Chain
from Products.ZenEvents.ZenEventClasses import Perf_Snmp, Status_Snmp, Status_Perf
from Products.ZenEvents import Event

# We retrieve our configuration data remotely via a Twisted PerspectiveBroker
# connection. To do so, we need to import the class that will be used by the
# configuration service to send the data over, i.e. SnmpDeviceProxy.
from Products.ZenUtils.Utils import unused
from Products.ZenHub.services.SnmpPerformanceConfig import SnmpDeviceProxy
unused(SnmpDeviceProxy)
from Products.ZenHub.services.PerformanceConfig import SnmpConnInfo
unused(SnmpConnInfo)

COLLECTOR_NAME = "zenperfsnmp"
MAX_BACK_OFF_MINUTES = 20


# NOTE(review): class header reconstructed (name taken from the docstring
# below and from the __main__ block; base class assumed to be `object`).
class SnmpPerformanceCollectionPreferences(object):
    """
    Collector preferences for the zenperfsnmp daemon: default cycle
    intervals, the ZenHub configuration service name and the daemon's
    command-line options.
    """
    zope.interface.implements(ICollectorPreferences)

    # NOTE(review): def line reconstructed.
    def __init__(self):
        """
        Constructs a new SnmpPerformanceCollectionPreferences instance and
        provides default values for needed attributes.
        """
        self.collectorName = COLLECTOR_NAME
        self.defaultRRDCreateCommand = None
        self.configCycleInterval = 20 # minutes
        self.cycleInterval = 5 * 60 # seconds

        # The configurationService attribute is the fully qualified class-name
        # of our configuration service that runs within ZenHub
        self.configurationService = 'Products.ZenHub.services.SnmpPerformanceConfig'

        # Will be filled in based on buildOptions
        self.options = None

    # NOTE(review): def line reconstructed (the body uses a `parser` name).
    def buildOptions(self, parser):
        """
        Register the command-line options specific to this daemon.

        @param parser: option parser to add the options to
        @type parser: optparse-style parser providing add_option()
        """
        parser.add_option('--showrawresults',
                          dest='showrawresults',
                          action="store_true",
                          default=False,
                          help="Show the raw RRD values. For debugging purposes only.")

        # type='int' is required: without it a value given on the command
        # line arrives as a string, and the task's `maxbackoffminutes * 60`
        # would repeat that string 60 times instead of converting to seconds.
        parser.add_option('--maxbackoffminutes',
                          dest='maxbackoffminutes',
                          type='int',
                          default=MAX_BACK_OFF_MINUTES,
                          help="When a device fails to respond, increase the time to" \
                               " check on the device until this limit.")

    # NOTE(review): this method is not visible in the recovered listing (two
    # source lines are missing here); presumably the no-op hook required by
    # ICollectorPreferences -- confirm against upstream.
    def postStartup(self):
        pass


# NOTE(review): definition reconstructed; the name is referenced by
# _failure() and _checkOidResults() below, the base class is assumed.
class SingleOidSwitchException(Exception):
    pass


STATUS_EVENT = { 'eventClass' : Status_Snmp,
                 'component' : 'snmp',
                 'eventGroup' : 'SnmpTest' }


# NOTE(review): class header reconstructed (name from the super() call and
# the __main__ block; BaseTask assumed as base since it is imported and
# self.chunk / self._delayNextCheck / self._returnToNormalSchedule are not
# defined in this module).
class SnmpPerformanceCollectionTask(BaseTask):
    """
    A task that performs periodic performance collection for devices providing
    data via SNMP agents.
    """
    zope.interface.implements(IScheduledTask)

    STATE_CONNECTING = 'CONNECTING'
    STATE_FETCH_PERF = 'FETCH_PERF_DATA'
    STATE_STORE_PERF = 'STORE_PERF_DATA'

    # NOTE(review): signature reconstructed from the docstring's parameter
    # list and the names used in the body.
    def __init__(self,
                 deviceId,
                 taskName,
                 scheduleIntervalSeconds,
                 taskConfig):
        """
        @param deviceId: the Zenoss deviceId to watch
        @type deviceId: string
        @param taskName: the unique identifier for this task
        @type taskName: string
        @param scheduleIntervalSeconds: the interval at which this task will be
               collected
        @type scheduleIntervalSeconds: int
        @param taskConfig: the configuration for this task
        """
        # The interval handed to the base class comes from the device
        # configuration; scheduleIntervalSeconds is not used here.
        super(SnmpPerformanceCollectionTask, self).__init__(
                 deviceId, taskName,
                 taskConfig.cycleInterval, taskConfig
               )

        # Needed for interface
        self.name = taskName
        self.configId = deviceId
        self.state = TaskStates.STATE_IDLE

        # The taskConfig corresponds to a DeviceProxy
        self._device = taskConfig
        self._devId = self._device.id
        self._manageIp = self._device.snmpConnInfo.manageIp
        self._maxOidsPerRequest = self._device.zMaxOIDPerRequest
        log.debug("SnmpPerformanceCollectionTask.__init__: self._maxOidsPerRequest=%s",
                  self._maxOidsPerRequest)
        self.interval = self._device.cycleInterval
        self._singleOidMode = False
        self._collectedOids = 0

        self._dataService = zope.component.queryUtility(IDataService)
        self._eventService = zope.component.queryUtility(IEventService)

        self._preferences = zope.component.queryUtility(ICollectorPreferences,
                                                        COLLECTOR_NAME)

        self._snmpProxy = None
        self._snmpConnInfo = self._device.snmpConnInfo
        self._oids = self._device.oids
        self._snmpStatusFailures = 0
        self._snmpPort = snmpprotocol.port()
        self._maxbackoffseconds = self._preferences.options.maxbackoffminutes * 60

        # Last error text logged by _failure(); used to avoid logging the
        # same message on every cycle.
        self._lastErrorMsg = ''

    # NOTE(review): def line reconstructed (registered as errback in
    # _fetchPerf() and doTask()).
    def _failure(self, reason):
        """
        Twisted errBack to log the exception for a single device.

        @param reason: explanation of the failure
        @type reason: Twisted error instance
        @return: None when the error has been handled here, otherwise the
                 original failure so that it keeps propagating
        """
        self._snmpStatusFailures += 1
        # Decode the exception
        if isinstance(reason.value, error.TimeoutError):
            msg = ('SNMP agent down (%s second timeout connecting to'
                   ' device %s)') % (self._snmpConnInfo.zSnmpTimeout, self._devId)
            # Indicate that we've handled the error by
            # not returning a result
            reason = None

        elif isinstance(reason.value, Snmpv3Error):
            msg = ("Cannot connect to SNMP agent on {0._devId}: {1.value}").format(self, reason)
            reason = None

        elif isinstance(reason.value, SingleOidSwitchException):
            return # Just wait for the next cycle

        else:
            msg = reason.getErrorMessage()
            if not msg: # Sometimes we get blank error messages
                # Report the class of the wrapped exception; `reason` itself
                # is always a Failure, so its own class carries no information.
                msg = reason.value.__class__
            msg = '%s %s' % (self._devId, msg)

            # Leave 'reason' alone to generate a traceback

        # Only log when the message differs from the previous one.
        if self._lastErrorMsg != msg:
            self._lastErrorMsg = msg
            if msg:
                log.error(msg)

        self._eventService.sendEvent(STATUS_EVENT,
                                     device=self._devId,
                                     summary=msg,
                                     severity=Event.Error)
        self._delayNextCheck()

        return reason

    # NOTE(review): def line reconstructed (registered as callback in doTask()).
    def _connectCallback(self, result):
        """
        Callback called after a successful connect to the remote device.

        @param result: value produced by _connect()
        @return: the same value, unchanged
        """
        # If we want to model things first before doing collection,
        # that code goes here.
        log.debug("Connected to %s [%s]", self._devId, self._manageIp)
        self._collectedOids = 0
        return result

    # NOTE(review): def line reconstructed (registered as callback in doTask()).
    def _fetchPerf(self, ignored):
        """
        Get performance data for all the monitored components on a device

        @param ignored: required to keep Twisted's callback chain happy
        @type ignored: result of previous callback
        @return: deferred for the fetch / check / store / status pipeline
        @rtype: Twisted deferred object
        """
        self.state = SnmpPerformanceCollectionTask.STATE_FETCH_PERF
        if not self._oids:
            return defer.succeed(([]))

        # Either get as many OIDs as we can or one-by-one
        oidsPerRequest = self._maxOidsPerRequest if not self._singleOidMode else 1
        log.debug("Retrieving OIDs from %s [%s] oidsPerRequest=%s", self._devId, self._manageIp, oidsPerRequest)

        d = Chain(self._get, iter(self.chunk(self._oids.keys(), oidsPerRequest))).run()
        d.addCallback(self._checkOidResults)
        d.addCallback(self._storeOidResults)
        d.addCallback(self._updateStatus)
        d.addErrback(self._failure)
        return d

    # NOTE(review): def line reconstructed (registered as callback in _fetchPerf()).
    def _checkOidResults(self, results):
        """
        Decode responses from the device and sanity check the responses

        @param results: results of SNMP gets
        @type results: array of (boolean, dictionaries)
        @return: the results unchanged, or a failed deferred when collection
                 must be aborted for this cycle
        """
        if not results:
            summary = 'Unable to retrieve OIDs from device %s' % \
                       self._devId
            self._eventService.sendEvent(STATUS_EVENT,
                                         device=self._devId,
                                         summary=summary,
                                         severity=Event.Error)
            log.info(summary)
            # A Failure must wrap an exception, not a bare string.
            return defer.fail(Exception(summary))

        # Look for problems
        for success, update in results:
            # empty update is probably a bad OID in the request somewhere
            if success and not update and not self._singleOidMode:
                self._singleOidMode = True
                msg = 'Error collecting data on %s -- retrying in single-OID mode' % \
                      self._devId
                log.warn(msg)
                return defer.fail(SingleOidSwitchException(msg)) # Wait for the next cycle

            if not success:
                if isinstance(update, Failure) and \
                    isinstance(update.value, (error.TimeoutError, Snmpv3Error)):
                    return defer.fail(update)
                else:
                    log.warning('Failed to collect on %s (%s: %s)',
                                self._devId,
                                update.__class__,
                                update)
        return results

    # NOTE(review): def line reconstructed (registered as callback in _fetchPerf()).
    def _storeOidResults(self, results):
        """
        Store the OID values in RRD files

        @param results: results of SNMP gets
        @type results: array of (boolean, dictionaries)
        @return: True when at least one request succeeded (or when there
                 were no results at all), False otherwise
        @rtype: boolean
        """
        self.state = SnmpPerformanceCollectionTask.STATE_STORE_PERF
        oidsReceived = set()
        successCount = 0
        for success, update in results:
            if not success:
                continue

            successCount += 1

            # Casting update to a dict here is unnecessary in all known cases.
            # See ticket #7347 for a bug where update would be a tuple at this
            # point instead of a dict. This cast fixes that problem.
            for oid, value in dict(update).items():
                oid = oid.strip('.')
                if oid not in self._oids:
                    log.error("OID %s is not in %s", oid, self._oids.keys())
                    continue

                # We should always get something useful back
                if value == '' or value is None:
                    log.debug("Got bad value: oid=%s value=%s" % (oid, value))
                    self._badOid(oid)
                    continue

                self._collectedOids += 1
                oidsReceived.add(oid)
                # An OID's data can be stored multiple times
                for rrdMeta in self._oids[oid]:
                    cname, path, rrdType, rrdCommand, rrdMin, rrdMax = rrdMeta
                    self._dataService.writeRRD(path, value, rrdType,
                                               rrdCommand=rrdCommand,
                                               min=rrdMin, max=rrdMax)

        if successCount == len(results) and self._singleOidMode:
            # Remove any oids that didn't report
            for doomed in set(self._oids.keys()) - oidsReceived:
                log.debug("Removing OID %s (no response)" % doomed)
                self._badOid(doomed)

        success = True
        if results:
            success = successCount > 0

        return success

    # NOTE(review): def line reconstructed (registered via addBoth in doTask()).
    def _finished(self, result):
        """
        Callback activated when the task is complete

        @param result: result of the collection chain, or a Failure
        @type result: array of (boolean, dictionaries)
        @return: the result unchanged
        """
        if not isinstance(result, Failure):
            log.debug("Device %s [%s] %d of %d OIDs scanned successfully",
                      self._devId, self._manageIp, self._collectedOids,
                      len(self._oids.keys()))
            self._returnToNormalSchedule()
        else:
            log.debug("Device %s [%s] scanned failed, %s",
                      self._devId, self._manageIp, result.getErrorMessage())

        # Closing is best-effort: a close error must not mask the result.
        try:
            self._close()
        except Exception as ex:
            log.warn("Failed to close device %s: error %s" %
                     (self._devId, str(ex)))

        # Return the result so the framework can track success/failure
        return result

    # NOTE(review): def line reconstructed; the method name is not visible in
    # the recovered listing -- presumably the IScheduledTask cleanup hook.
    def cleanup(self):
        """
        Close the SNMP session held by this task, if any.
        """
        return self._close()

    # NOTE(review): def line reconstructed; the method name is not visible in
    # the recovered listing -- presumably the IScheduledTask entry point.
    def doTask(self):
        """
        Contact to one device and return a deferred which gathers data from
        the device.

        @return: A task to scan the OIDs on a device.
        @rtype: Twisted deferred object
        """
        # See if we need to connect first before doing any collection
        d = defer.maybeDeferred(self._connect)
        d.addCallbacks(self._connectCallback, self._failure)
        d.addCallback(self._fetchPerf)

        # Call _finished for both success and error scenarios
        d.addBoth(self._finished)

        # Wait until the Deferred actually completes
        return d

    # NOTE(review): def line reconstructed (passed to Chain in _fetchPerf()).
    def _get(self, oids):
        """
        Perform SNMP get for specified OIDs

        @param oids: OIDs to gather
        @type oids: list of strings
        @return: Twisted deferred
        @rtype: Twisted deferred
        """
        return self._snmpProxy.get(oids,
                                   self._snmpConnInfo.zSnmpTimeout,
                                   self._snmpConnInfo.zSnmpTries)

    # NOTE(review): def line reconstructed (called from doTask()).
    def _connect(self):
        """
        Create a connection to the remote device

        @return: the SNMP session for this device; an existing session is
                 reused while its connection info is unchanged
        """
        self.state = SnmpPerformanceCollectionTask.STATE_CONNECTING
        if (self._snmpProxy is None or
            self._snmpProxy._snmpConnInfo != self._snmpConnInfo):
            self._snmpProxy = self._snmpConnInfo.createSession(
                                   protocol=self._snmpPort.protocol,
                                   allowCache=True)
            self._snmpProxy.open()
            # NOTE(review): nesting of this debug line was lost in the
            # listing; placed with the open() it reports -- confirm.
            log.debug("SnmpPerformanceCollectionTask._connect: Connected to %s" % self._snmpConnInfo.manageIp)
        return self._snmpProxy

    # NOTE(review): def line reconstructed (called from _finished()).
    def _close(self):
        """
        Close down the connection to the remote device
        """
        if self._snmpProxy:
            self._snmpProxy.close()
        self._snmpProxy = None

    # NOTE(review): def line reconstructed (registered as callback in _fetchPerf()).
    def _updateStatus(self, success):
        """
        Send up/down events based on SNMP results

        @param success: Did everything work?
        @type success: boolean
        @return: already-fired deferred carrying the current count of
                 consecutive SNMP status failures
        @rtype: Twisted deferred object
        """
        if success:
            # As we might not be the process that detected
            # something was down, always send clear events.
            # These are deduped out by the daemon code.
            summary = 'Gathered all OIDs'
            self._eventService.sendEvent(STATUS_EVENT,
                                         device=self._devId, summary=summary,
                                         severity=Event.Clear)
            if self._snmpStatusFailures > 0:
                log.info("%s %s", self._devId, summary)
            self._snmpStatusFailures = 0

            # Announce recovery only when an error had actually been recorded
            # by _failure(); the previous test was inverted and logged this
            # on every healthy cycle instead.
            if self._lastErrorMsg:
                log.info("%s returned back to normal operations",
                         self._devId)
            self._lastErrorMsg = ''
            if self.interval != self._device.cycleInterval:
                # Setting the value kicks off observers, so don't
                # reset unless necessary
                self.interval = self._device.cycleInterval

        else:
            summary = 'Failed to collect all OIDs'
            self._eventService.sendEvent(STATUS_EVENT,
                                         device=self._devId, summary=summary,
                                         severity=Event.Warning)
            log.debug("%s %s", self._devId, summary)
            self._snmpStatusFailures += 1

        return defer.succeed(self._snmpStatusFailures)

    # NOTE(review): def line reconstructed (called from _storeOidResults()).
    def _badOid(self, oid):
        """
        Report any bad OIDs and then remove the OID so we
        don't generate any further errors.

        @param oid: the OID that is not responding
        @type oid: string
        """
        names = [dp[0] for dp in self._oids[oid]]
        summary = 'Error reading value for %s (%s) on %s' % (
            names, oid, self._devId)
        log.warn(summary)

        del self._oids[oid]

    # NOTE(review): def line reconstructed; the method name is not visible in
    # the recovered listing -- presumably the scheduler's statistics hook.
    def displayStatistics(self):
        """
        Called by the collector framework scheduler, and allows us to
        see how each task is doing.

        @return: one-line summary of this task, followed by the last error
                 message when there is one
        @rtype: string
        """
        display = "%s OIDs: %d inSingleOidMode: %s\n" % (
            self.name, len(self._oids.keys()), self._singleOidMode)
        if self._lastErrorMsg:
            display += "%s\n" % self._lastErrorMsg
        return display


if __name__ == '__main__':
    myPreferences = SnmpPerformanceCollectionPreferences()
    myTaskFactory = SimpleTaskFactory(SnmpPerformanceCollectionTask)
    myTaskSplitter = SimpleTaskSplitter(myTaskFactory)
    daemon = CollectorDaemon(myPreferences, myTaskSplitter)
    daemon.run()
Trees | Indices | Help |
|
---|
Generated by Epydoc 3.0.1.1812 on Tue Oct 11 12:51:57 2011 | http://epydoc.sourceforge.net |