#!/usr/bin/python3 # -*- encoding: utf-8; py-indent-offset: 4 -*- # +------------------------------------------------------------------+ # | ____ _ _ __ __ _ __ | # | / ___| |__ ___ ___| | __ | \/ | |/ / | # | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / | # | | |___| | | | __/ (__| < | | | | . \ | # | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ | # | | # | Copyright Mathias Kettner 2012 mk@mathias-kettner.de | # +------------------------------------------------------------------+ # # This file is part of Check_MK. # The official homepage is at http://mathias-kettner.de/check_mk. # # check_mk is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by # the Free Software Foundation in version 2. check_mk is distributed # in the hope that it will be useful, but WITHOUT ANY WARRANTY; with- # out even the implied warranty of MERCHANTABILITY or FITNESS FOR A # PARTICULAR PURPOSE. See the GNU General Public License for more de- # tails. You should have received a copy of the GNU General Public # License along with GNU Make; see the file COPYING. If not, write # to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, # Boston, MA 02110-1301 USA. ####################################### # Script developed by S M Raju ####################################### # Update by KuhnC/SAL # 10.11.2023: Update to report more values (version, ...) # 10.11.2023: Switch for different nvidia-smi version (fanspeed, power src) # 14.11.2023: Another nvidia-smi version switch (this time ecc) # 14.11.2023: GRID cards do not report any power draw...
from subprocess import Popen, PIPE
import xml.dom.minidom
from functools import reduce  # noqa: F401 -- no longer used below; import retained

# NOTE(review): Check_MK agent section headers usually have the form
# "<<<section_name>>>". This literal is emitted exactly as found in the
# script -- confirm it is what the matching check expects.
SECTION_HEADER = "<<>>"

# nvidia-smi releases name the power container and the limit tag differently;
# each entry is (container tag, power-limit tag), tried in this order.
_POWER_LAYOUTS = (
    ("power_readings", "power_limit"),
    ("gpu_power_readings", "current_power_limit"),
)


def _first_text(node, *path):
    """Return the text of the element reached by following *path* below *node*.

    At every step the first descendant carrying the given tag name is taken.

    Raises:
        IndexError: a tag on the path is absent or the final element is empty.
    """
    for tag in path:
        node = node.getElementsByTagName(tag)[0]
    return node.childNodes[0].data


def _metric(node, path, cast=int):
    """Read the leading token of a metric (e.g. ``"45 C"`` -> ``45``).

    Returns 0 when the element is missing, empty, or holds a non-numeric
    placeholder such as ``N/A``, so one unsupported sensor cannot abort the
    whole report.
    """
    try:
        return cast(_first_text(node, *path).split()[0])
    except (IndexError, ValueError):
        return 0


def _memory_utilization(gpu):
    """Return used/total frame-buffer memory in percent (2 decimals), or 0."""
    try:
        usage = gpu.getElementsByTagName("fb_memory_usage")[0]
        used = int(_first_text(usage, "used").split()[0])
        total = int(_first_text(usage, "total").split()[0])
        return round((used / total) * 100, 2)
    except (IndexError, ValueError, ZeroDivisionError):
        # Missing node, "N/A" values, or a reported total of 0.
        return 0


def _power_metrics(gpu):
    """Return ``(power_draw, power_limit)``; each is 0 when unavailable."""
    for container_tag, limit_tag in _POWER_LAYOUTS:
        containers = gpu.getElementsByTagName(container_tag)
        if containers:
            readings = containers[0]
            return (
                _metric(readings, ("power_draw",), float),
                _metric(readings, (limit_tag,), float),
            )
    # Neither known power container is present.
    return 0, 0


def _process_count(gpu):
    """Count the element children of the first ``processes`` node (0 if absent)."""
    containers = gpu.getElementsByTagName("processes")
    if not containers:
        return 0
    element_type = xml.dom.minidom.Node.ELEMENT_NODE
    return sum(1 for child in containers[0].childNodes
               if child.nodeType == element_type)


def _format_gpu(gpu):
    """Build the space-separated report line for one ``<gpu>`` element.

    Identity fields (minor number, product name, VBIOS version) are required
    and raise if absent; numeric metrics fall back to 0.
    """
    power_draw, power_limit = _power_metrics(gpu)
    ecc_aggregate = ("ecc_errors", "aggregate")
    fields = (
        int(_first_text(gpu, "minor_number").split()[0]),
        # Whitespace in the product name would break the column layout.
        "-".join(_first_text(gpu, "product_name").split()),
        _metric(gpu, ("fan_speed",)),
        _metric(gpu, ("utilization", "gpu_util")),
        _memory_utilization(gpu),
        _metric(gpu, ecc_aggregate + ("sram_uncorrectable",)),
        _metric(gpu, ecc_aggregate + ("dram_uncorrectable",)),
        _metric(gpu, ("temperature", "gpu_temp")),
        power_draw,
        power_limit,
        _process_count(gpu),
        _first_text(gpu, "vbios_version"),
    )
    return " ".join(str(field) for field in fields)


def iter_report_lines(xml_data):
    """Yield the report lines for an ``nvidia-smi -q -x`` XML document.

    Args:
        xml_data: the XML document as ``str`` or ``bytes``.

    Yields:
        First the driver version, then one line per ``<gpu>`` element with
        the columns: id, name, fan speed, GPU utilization, memory
        utilization, SRAM ECC errors, DRAM ECC errors, temperature, power
        draw, power limit, process count, VBIOS version.
    """
    document = xml.dom.minidom.parseString(xml_data)
    yield _first_text(document, "driver_version")
    for gpu in document.getElementsByTagName("gpu"):
        yield _format_gpu(gpu)


def main():
    """Print the section header, then the report obtained from nvidia-smi."""
    # The header is emitted before nvidia-smi is invoked, as before.
    print(SECTION_HEADER)
    xml_data = Popen(["nvidia-smi", "-q", "-x"], stdout=PIPE).communicate()[0]
    for line in iter_report_lines(xml_data):
        print(line)


if __name__ == "__main__":
    main()