Hmmm…
Thank you - but I don't get it…
I copied the /opt/omd/sites/<mysite>/local/share/check_mk/agents/plugins/nvidia_smi
file from the CheckMK server to /usr/lib/check_mk_agent/plugins/nvidia_smi
on the CheckMK client.
Isn't that what we are supposed to do?
And that file has the following content:
#!/usr/bin/python3
# -*- encoding: utf-8; py-indent-offset: 4 -*-
# +------------------------------------------------------------------+
# | ____ _ _ __ __ _ __ |
# | / ___| |__ ___ ___| | __ | \/ | |/ / |
# | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / |
# | | |___| | | | __/ (__| < | | | | . \ |
# | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ |
# | |
# | Copyright Mathias Kettner 2012 mk@mathias-kettner.de |
# +------------------------------------------------------------------+
#
# This file is part of Check_MK.
# The official homepage is at http://mathias-kettner.de/check_mk.
#
# check_mk is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation in version 2. check_mk is distributed
# in the hope that it will be useful, but WITHOUT ANY WARRANTY; with-
# out even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE. See the GNU General Public License for more de-
# tails. You should have received a copy of the GNU General Public
# License along with GNU Make; see the file COPYING. If not, write
# to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
# Boston, MA 02110-1301 USA.
#######################################
# Check developed by
#######################################
# Dr. Markus Hillenbrand
# University of Kaiserslautern, Germany
# hillenbr@rhrk.uni-kl.de
#######################################
#######################################
# Script modified by S M Raju
#######################################
# the inventory functions
def inventory_nvidia_smi_fan(info):
    """Discover a fan-speed service for every GPU row with a fan reading.

    Args:
        info: Parsed agent section; each entry is a list of string fields.
            Field 0 is used to build the item name, field 2 is the value
            compared against 'N/A'.

    Returns:
        A list of ``(item, "", None)`` tuples, one per row whose field 2 is
        present and not 'N/A'. The item is ``"GPU"`` followed by field 0.
    """
    # Rows too short to contain field 2 are skipped rather than raising
    # IndexError, so one malformed agent line cannot abort discovery.
    return [
        ("GPU" + line[0], "", None)
        for line in info
        if len(line) > 2 and line[2] != 'N/A'
    ]
def inventory_nvidia_smi_gpuutil(info):
    """Discover a utilization service for every GPU row with a usable value.

    Args:
        info: Parsed agent section; each entry is a list of string fields.
            Field 0 is used to build the item name, field 3 is the value
            compared against 'N/A'.

    Returns:
        A list of ``(item, "", None)`` tuples, one per row whose field 3 is
        present and not 'N/A'. The item is ``"GPU"`` followed by field 0.
    """
    # Rows too short to contain field 3 are skipped rather than raising
    # IndexError, so one malformed agent line cannot abort discovery.
    return [
        ("GPU" + line[0], "", None)
        for line in info
        if len(line) > 3 and line[3] != 'N/A'
    ]
def inventory_nvidia_smi_memutil(info):
    """Discover a memory-utilization service for every GPU row with a value.

    Args:
        info: Parsed agent section; each entry is a list of string fields.
            Field 0 is used to build the item name, field 4 is the value
            compared against 'N/A'.

    Returns:
        A list of ``(item, "", None)`` tuples, one per row whose field 4 is
        present and not 'N/A'. The item is ``"GPU"`` followed by field 0.
    """
    # Rows too short to contain field 4 are skipped rather than raising
    # IndexError, so one malformed agent line cannot abort discovery.
    return [
        ("GPU" + line[0], "", None)
        for line in info
        if len(line) > 4 and line[4] != 'N/A'
    ]
def inventory_nvidia_smi_errors1(info):
    """Discover a single-bit-error service for every GPU row with a counter.

    Args:
        info: Parsed agent section; each entry is a list of string fields.
            Field 0 is used to build the item name, field 5 is the value
            compared against 'N/A'.

    Returns:
        A list of ``(item, "", None)`` tuples, one per row whose field 5 is
        present and not 'N/A'. The item is ``"GPU"`` followed by field 0.
    """
    # Rows too short to contain field 5 are skipped rather than raising
    # IndexError, so one malformed agent line cannot abort discovery.
    return [
        ("GPU" + line[0], "", None)
        for line in info
        if len(line) > 5 and line[5] != 'N/A'
    ]
def inventory_nvidia_smi_errors2(info):
    """Discover a double-bit-error service for every GPU row with a counter.

    Args:
        info: Parsed agent section; each entry is a list of string fields.
            Field 0 is used to build the item name, field 6 is the value
            compared against 'N/A'.

    Returns:
        A list of ``(item, "", None)`` tuples, one per row whose field 6 is
        present and not 'N/A'. The item is ``"GPU"`` followed by field 0.
    """
    # Rows too short to contain field 6 are skipped rather than raising
    # IndexError, so one malformed agent line cannot abort discovery.
    return [
        ("GPU" + line[0], "", None)
        for line in info
        if len(line) > 6 and line[6] != 'N/A'
    ]
def inventory_nvidia_smi_temp(info):
    """Discover a temperature service for every GPU row with a reading.

    Args:
        info: Parsed agent section; each entry is a list of string fields.
            Field 0 is used to build the item name, field 7 is the value
            compared against 'N/A'.

    Returns:
        A list of ``(item, "", None)`` tuples, one per row whose field 7 is
        present and not 'N/A'. The item is ``"GPU"`` followed by field 0.
    """
    # Rows too short to contain field 7 are skipped rather than raising
    # IndexError, so one malformed agent line cannot abort discovery.
    return [
        ("GPU" + line[0], "", None)
        for line in info
        if len(line) > 7 and line[7] != 'N/A'
    ]
def inventory_nvidia_smi_power(info):
    """Discover a power service for every GPU row with draw and limit values.

    Args:
        info: Parsed agent section; each entry is a list of string fields.
            Field 0 is used to build the item name; fields 8 and 9 are the
            values compared against 'N/A'.

    Returns:
        A list of ``(item, "", None)`` tuples, one per row whose fields 8
        and 9 are both present and not 'N/A'. The item is ``"GPU"``
        followed by field 0.
    """
    # Rows too short to contain field 9 are skipped rather than raising
    # IndexError, so one malformed agent line cannot abort discovery.
    return [
        ("GPU" + line[0], "", None)
        for line in info
        if len(line) > 9 and line[8] != 'N/A' and line[9] != "N/A"
    ]
# the check functions
def check_nvidia_smi_fan(item, params, info):
    """Evaluate the fan speed (field 2) of the GPU row matching ``item``.

    Args:
        item: Service item; matched against ``"GPU"`` + field 0 of each row.
        params: Not used; the 90 / 95 thresholds are hard-coded.
        info: Parsed agent section; each entry is a list of string fields.
            Field 1 is inserted into the status text.

    Returns:
        ``(state, text, perfdata)`` with state 2 above 95, state 1 above 90
        and state 0 otherwise, or ``(3, text)`` when the row is missing or
        its fan field cannot be converted to an integer.
    """
    for line in info:
        # An empty row has no field 0; skip it instead of raising IndexError.
        if not line or "GPU" + line[0] != item:
            continue
        try:
            value = int(line[2])
        except (IndexError, ValueError):
            # The field is absent or non-numeric (e.g. 'N/A'): report
            # UNKNOWN rather than crashing the check.
            return (3, "UNKNOWN - %s fan speed is not available" % item)
        perfdata = [('fan', value, 90, 95, 0, 100)]
        if value > 95:
            return (2, "CRITICAL - %s fan speed is %d%%" % (line[1], value), perfdata)
        elif value > 90:
            return (1, "WARNING - %s fan speed is %d%%" % (line[1], value), perfdata)
        else:
            return (0, "OK - %s fan speed is %d%%" % (line[1], value), perfdata)
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)
def check_nvidia_smi_gpuutil(item, params, info):
    """Report the utilization (field 3) of the GPU row matching ``item``.

    Args:
        item: Service item; matched against ``"GPU"`` + field 0 of each row.
        params: Not used.
        info: Parsed agent section; each entry is a list of string fields.
            Field 1 is inserted into the status text.

    Returns:
        ``(0, text, perfdata)`` for a matching row (the state is always OK),
        or ``(3, text)`` when the row is missing or its utilization field
        cannot be converted to an integer.
    """
    for line in info:
        # An empty row has no field 0; skip it instead of raising IndexError.
        if not line or "GPU" + line[0] != item:
            continue
        try:
            value = int(line[3])
        except (IndexError, ValueError):
            # The field is absent or non-numeric (e.g. 'N/A'): report
            # UNKNOWN rather than crashing the check.
            return (3, "UNKNOWN - %s utilization is not available" % item)
        perfdata = [('gpuutil', value, 100, 100, 0, 100)]
        return (0, "OK - %s utilization is %s%%" % (line[1], value), perfdata)
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)
def check_nvidia_smi_memutil(item, params, info):
    """Evaluate the memory utilization (field 4) of the GPU row matching ``item``.

    Args:
        item: Service item; matched against ``"GPU"`` + field 0 of each row.
        params: Not used; the 90 / 95 thresholds are hard-coded.
        info: Parsed agent section; each entry is a list of string fields.
            Field 1 is inserted into the status text.

    Returns:
        ``(state, text, perfdata)`` with state 2 above 95, state 1 above 90
        and state 0 otherwise, or ``(3, text)`` when the row is missing or
        its memory field cannot be converted to an integer.
    """
    for line in info:
        # An empty row has no field 0; skip it instead of raising IndexError.
        if not line or "GPU" + line[0] != item:
            continue
        try:
            value = int(line[4])
        except (IndexError, ValueError):
            # The field is absent or non-numeric (e.g. 'N/A'): report
            # UNKNOWN rather than crashing the check.
            return (3, "UNKNOWN - %s memory utilization is not available" % item)
        # The perfdata levels (100, 100) intentionally mirror the original
        # and differ from the 90 / 95 thresholds applied below.
        perfdata = [('memutil', value, 100, 100, 0, 100)]
        if value > 95:
            return (2, "CRITICAL - %s memory utilization is %d%%" % (line[1], value), perfdata)
        elif value > 90:
            return (1, "WARNING - %s memory utilization is %d%%" % (line[1], value), perfdata)
        else:
            return (0, "OK - %s memory utilization is %d%%" % (line[1], value), perfdata)
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)
def check_nvidia_smi_errors1(item, params, info):
    """Evaluate the single-bit error counter (field 5) of the matching GPU row.

    Args:
        item: Service item; matched against ``"GPU"`` + field 0 of each row.
        params: Not used; the 100 / 500 thresholds are hard-coded.
        info: Parsed agent section; each entry is a list of string fields.
            Field 1 is inserted into the status text.

    Returns:
        ``(state, text)`` with state 2 above 500, state 1 above 100 and
        state 0 otherwise; state 3 when the row is missing or its counter
        field cannot be converted to an integer. No perfdata is returned.
    """
    for line in info:
        # An empty row has no field 0; skip it instead of raising IndexError.
        if not line or "GPU" + line[0] != item:
            continue
        try:
            value = int(line[5])
        except (IndexError, ValueError):
            # The field is absent or non-numeric (e.g. 'N/A'): report
            # UNKNOWN rather than crashing the check.
            return (3, "UNKNOWN - %s single bit error counter is not available" % item)
        if value > 500:
            return (2, "CRITICAL - %s single bit error counter is %d" % (line[1], value))
        elif value > 100:
            return (1, "WARNING - %s single bit error counter is %d" % (line[1], value))
        else:
            return (0, "OK - %s single bit error counter is %d" % (line[1], value))
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)
def check_nvidia_smi_errors2(item, params, info):
    """Evaluate the double-bit error counter (field 6) of the matching GPU row.

    Args:
        item: Service item; matched against ``"GPU"`` + field 0 of each row.
        params: Not used; the 100 / 500 thresholds are hard-coded.
        info: Parsed agent section; each entry is a list of string fields.
            Field 1 is inserted into the status text.

    Returns:
        ``(state, text)`` with state 2 above 500, state 1 above 100 and
        state 0 otherwise; state 3 when the row is missing or its counter
        field cannot be converted to an integer. No perfdata is returned.
    """
    for line in info:
        # An empty row has no field 0; skip it instead of raising IndexError.
        if not line or "GPU" + line[0] != item:
            continue
        try:
            value = int(line[6])
        except (IndexError, ValueError):
            # The field is absent or non-numeric (e.g. 'N/A'): report
            # UNKNOWN rather than crashing the check.
            return (3, "UNKNOWN - %s double bit error counter is not available" % item)
        if value > 500:
            return (2, "CRITICAL - %s double bit error counter is %d" % (line[1], value))
        elif value > 100:
            return (1, "WARNING - %s double bit error counter is %d" % (line[1], value))
        else:
            return (0, "OK - %s double bit error counter is %d" % (line[1], value))
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)
def check_nvidia_smi_temp(item, params, info):
    """Evaluate the temperature (field 7) of the GPU row matching ``item``.

    Args:
        item: Service item; matched against ``"GPU"`` + field 0 of each row.
        params: Not used; the 80 / 90 thresholds are hard-coded.
        info: Parsed agent section; each entry is a list of string fields.
            Field 1 is inserted into the status text.

    Returns:
        ``(state, text, perfdata)`` with state 2 above 90, state 1 above 80
        and state 0 otherwise, or ``(3, text)`` when the row is missing or
        its temperature field cannot be converted to an integer.
    """
    for line in info:
        # An empty row has no field 0; skip it instead of raising IndexError.
        if not line or "GPU" + line[0] != item:
            continue
        try:
            value = int(line[7])
        except (IndexError, ValueError):
            # The field is absent or non-numeric (e.g. 'N/A'): report
            # UNKNOWN rather than crashing the check.
            return (3, "UNKNOWN - %s temperature is not available" % item)
        perfdata = [('temp', value, 80, 90, 0, 95)]
        if value > 90:
            return (2, "CRITICAL - %s temperature is %d°C" % (line[1], value), perfdata)
        elif value > 80:
            return (1, "WARNING - %s temperature is %d°C" % (line[1], value), perfdata)
        else:
            return (0, "OK - %s temperature is %d°C" % (line[1], value), perfdata)
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)
def check_nvidia_smi_power(item, params, info):
    """Evaluate power draw (field 8) relative to the power limit (field 9).

    Args:
        item: Service item; matched against ``"GPU"`` + field 0 of each row.
        params: Not used; the 80% / 90% thresholds are hard-coded.
        info: Parsed agent section; each entry is a list of string fields.
            Field 1 is inserted into the status text.

    Returns:
        ``(state, text, perfdata)`` with state 2 when draw exceeds 90% of
        the limit, state 1 above 80% and state 0 otherwise, or ``(3, text)``
        when the row is missing, a field is not numeric, or the limit is
        not positive.
    """
    for line in info:
        # An empty row has no field 0; skip it instead of raising IndexError.
        if not line or "GPU" + line[0] != item:
            continue
        try:
            draw = float(line[8])
            limit = float(line[9])
        except (IndexError, ValueError):
            # A field is absent or non-numeric (e.g. 'N/A'): report UNKNOWN
            # rather than crashing the check.
            return (3, "UNKNOWN - %s power readings are not available" % item)
        if limit <= 0:
            # Dividing by a zero limit would raise ZeroDivisionError, and a
            # negative limit yields a meaningless percentage.
            return (3, "UNKNOWN - %s reports no usable power limit" % item)
        value = draw * 100.0 / limit
        perfdata = [('power', draw, limit * 0.8, limit * 0.9, 0, limit)]
        if value > 90:
            return (2, "CRITICAL - %s power utilization is %d%% of %dW" % (line[1], value, limit), perfdata)
        elif value > 80:
            return (1, "WARNING - %s power utilization is %d%% of %dW" % (line[1], value, limit), perfdata)
        else:
            return (0, "OK - %s power utilization is %d%% of %dW" % (line[1], value, limit), perfdata)
    return (3, "UNKNOWN - GPU %s not found in agent output" % item)
# Register each sub-check with Check_MK. Every entry names its check and
# inventory function, a service description template, enables perfdata and
# uses the "nvidia_smi" group.
# NOTE(review): no check_info entry for the errors1 / errors2 functions is
# visible in this section -- confirm whether that omission is intentional.
check_info['nvidia_smi.fan'] = dict(
    check_function=check_nvidia_smi_fan,
    inventory_function=inventory_nvidia_smi_fan,
    service_description="%s fan speed",
    has_perfdata=True,
    group="nvidia_smi",
)

check_info['nvidia_smi.gpuutil'] = dict(
    check_function=check_nvidia_smi_gpuutil,
    inventory_function=inventory_nvidia_smi_gpuutil,
    service_description="%s utilization",
    has_perfdata=True,
    group="nvidia_smi",
)

check_info['nvidia_smi.memutil'] = dict(
    check_function=check_nvidia_smi_memutil,
    inventory_function=inventory_nvidia_smi_memutil,
    service_description="%s memory",
    has_perfdata=True,
    group="nvidia_smi",
)

check_info['nvidia_smi.temp'] = dict(
    check_function=check_nvidia_smi_temp,
    inventory_function=inventory_nvidia_smi_temp,
    service_description="%s temperature",
    has_perfdata=True,
    group="nvidia_smi",
)

check_info['nvidia_smi.power'] = dict(
    check_function=check_nvidia_smi_power,
    inventory_function=inventory_nvidia_smi_power,
    service_description="%s power",
    has_perfdata=True,
    group="nvidia_smi",
)