func (h handle) deviceGetComputeRunningProcesses() ([]uint, []uint64, error) { var procs [szProcs]C.nvmlProcessInfo_t var count = C.uint(szProcs) r := C.nvmlDeviceGetComputeRunningProcesses(h.dev, &count, &procs[0]) if r == C.NVML_ERROR_NOT_SUPPORTED { return nil, nil, nil } n := int(count) pids := make([]uint, n) mems := make([]uint64, n) for i := 0; i < n; i++ { pids[i] = uint(procs[i].pid) mems[i] = uint64(procs[i].usedGpuMemory) } return pids, mems, errorString(r) }
func (d *Device) Status() (status *DeviceStatus, err error) { var ( power C.uint temp C.uint usage C.nvmlUtilization_t encoder [2]C.uint decoder [2]C.uint mem C.nvmlMemory_t ecc [3]C.ulonglong clock [2]C.uint bar1 C.nvmlBAR1Memory_t throughput [2]C.uint procname [szProcName]C.char procs [szProcs]C.nvmlProcessInfo_t nprocs = C.uint(szProcs) ) defer func() { if r := recover(); r != nil { err = r.(error) } }() assert(C.nvmlDeviceGetPowerUsage(d.handle, &power)) assert(C.nvmlDeviceGetTemperature(d.handle, C.NVML_TEMPERATURE_GPU, &temp)) assert(C.nvmlDeviceGetUtilizationRates(d.handle, &usage)) assert(C.nvmlDeviceGetEncoderUtilization(d.handle, &encoder[0], &encoder[1])) assert(C.nvmlDeviceGetDecoderUtilization(d.handle, &decoder[0], &decoder[1])) assert(C.nvmlDeviceGetMemoryInfo(d.handle, &mem)) assert(C.nvmlDeviceGetClockInfo(d.handle, C.NVML_CLOCK_SM, &clock[0])) assert(C.nvmlDeviceGetClockInfo(d.handle, C.NVML_CLOCK_MEM, &clock[1])) assert(C.nvmlDeviceGetBAR1MemoryInfo(d.handle, &bar1)) assert(C.nvmlDeviceGetComputeRunningProcesses(d.handle, &nprocs, &procs[0])) status = &DeviceStatus{ Power: uint(power / 1000), Temperature: uint(temp), Utilization: UtilizationInfo{ GPU: uint(usage.gpu), Encoder: uint(encoder[0]), Decoder: uint(decoder[0]), }, Memory: MemoryInfo{ GlobalUsed: uint64(mem.used / (1024 * 1024)), }, Clocks: ClockInfo{ Core: uint(clock[0]), Memory: uint(clock[1]), }, PCI: PCIStatusInfo{ BAR1Used: uint64(bar1.bar1Used / (1024 * 1024)), }, } r := C.nvmlDeviceGetMemoryErrorCounter(d.handle, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L1_CACHE, &ecc[0]) if r != C.NVML_ERROR_NOT_SUPPORTED { // only supported on Tesla cards assert(r) assert(C.nvmlDeviceGetMemoryErrorCounter(d.handle, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L2_CACHE, &ecc[1])) assert(C.nvmlDeviceGetMemoryErrorCounter(d.handle, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &ecc[2])) status.Memory.ECCErrors = ECCErrorsInfo{uint64(ecc[0]), uint64(ecc[1]), uint64(ecc[2])} } r = C.nvmlDeviceGetPcieThroughput(d.handle, C.NVML_PCIE_UTIL_RX_BYTES, &throughput[0]) if r != C.NVML_ERROR_NOT_SUPPORTED { // only supported on Maxwell or newer assert(r) assert(C.nvmlDeviceGetPcieThroughput(d.handle, C.NVML_PCIE_UTIL_TX_BYTES, &throughput[1])) status.PCI.Throughput = PCIThroughputInfo{uint(throughput[0]), uint(throughput[1])} } status.Processes = make([]ProcessInfo, nprocs) for i := range status.Processes { status.Processes[i].PID = uint(procs[i].pid) assert(C.nvmlSystemGetProcessName(procs[i].pid, &procname[0], szProcName)) status.Processes[i].Name = C.GoString(&procname[0]) } return }