func (h handle) deviceGetBAR1MemoryInfo() (*uint64, *uint64, error) { var bar1 C.nvmlBAR1Memory_t r := C.nvmlDeviceGetBAR1MemoryInfo(h.dev, &bar1) if r == C.NVML_ERROR_NOT_SUPPORTED { return nil, nil, nil } return uint64Ptr(bar1.bar1Total), uint64Ptr(bar1.bar1Used), errorString(r) }
func NewDevice(idx uint) (device *Device, err error) { var ( dev C.nvmlDevice_t model [szModel]C.char uuid [szUUID]C.char pci C.nvmlPciInfo_t minor C.uint bar1 C.nvmlBAR1Memory_t power C.uint clock [2]C.uint pciel [2]C.uint mask cpuMask ) defer func() { if r := recover(); r != nil { err = r.(error) } }() assert(C.nvmlDeviceGetHandleByIndex(C.uint(idx), &dev)) assert(C.nvmlDeviceGetName(dev, &model[0], szModel)) assert(C.nvmlDeviceGetUUID(dev, &uuid[0], szUUID)) assert(C.nvmlDeviceGetPciInfo(dev, &pci)) assert(C.nvmlDeviceGetMinorNumber(dev, &minor)) assert(C.nvmlDeviceGetBAR1MemoryInfo(dev, &bar1)) assert(C.nvmlDeviceGetPowerManagementLimit(dev, &power)) assert(C.nvmlDeviceGetMaxClockInfo(dev, C.NVML_CLOCK_SM, &clock[0])) assert(C.nvmlDeviceGetMaxClockInfo(dev, C.NVML_CLOCK_MEM, &clock[1])) assert(C.nvmlDeviceGetMaxPcieLinkGeneration(dev, &pciel[0])) assert(C.nvmlDeviceGetMaxPcieLinkWidth(dev, &pciel[1])) assert(C.nvmlDeviceGetCpuAffinity(dev, C.uint(len(mask)), (*C.ulong)(&mask[0]))) cpu, err := mask.cpuNode() if err != nil { return nil, err } device = &Device{ handle: dev, Model: C.GoString(&model[0]), UUID: C.GoString(&uuid[0]), Path: fmt.Sprintf("/dev/nvidia%d", uint(minor)), Power: uint(power / 1000), CPUAffinity: cpu, PCI: PCIInfo{ BusID: C.GoString(&pci.busId[0]), BAR1: uint64(bar1.bar1Total / (1024 * 1024)), Bandwidth: pcieGenToBandwidth[int(pciel[0])] * uint(pciel[1]) / 1000, }, Clocks: ClockInfo{ Core: uint(clock[0]), Memory: uint(clock[1]), }, } return }
func (d *Device) Status() (status *DeviceStatus, err error) { var ( power C.uint temp C.uint usage C.nvmlUtilization_t encoder [2]C.uint decoder [2]C.uint mem C.nvmlMemory_t ecc [3]C.ulonglong clock [2]C.uint bar1 C.nvmlBAR1Memory_t throughput [2]C.uint procname [szProcName]C.char procs [szProcs]C.nvmlProcessInfo_t nprocs = C.uint(szProcs) ) defer func() { if r := recover(); r != nil { err = r.(error) } }() assert(C.nvmlDeviceGetPowerUsage(d.handle, &power)) assert(C.nvmlDeviceGetTemperature(d.handle, C.NVML_TEMPERATURE_GPU, &temp)) assert(C.nvmlDeviceGetUtilizationRates(d.handle, &usage)) assert(C.nvmlDeviceGetEncoderUtilization(d.handle, &encoder[0], &encoder[1])) assert(C.nvmlDeviceGetDecoderUtilization(d.handle, &decoder[0], &decoder[1])) assert(C.nvmlDeviceGetMemoryInfo(d.handle, &mem)) assert(C.nvmlDeviceGetClockInfo(d.handle, C.NVML_CLOCK_SM, &clock[0])) assert(C.nvmlDeviceGetClockInfo(d.handle, C.NVML_CLOCK_MEM, &clock[1])) assert(C.nvmlDeviceGetBAR1MemoryInfo(d.handle, &bar1)) assert(C.nvmlDeviceGetComputeRunningProcesses(d.handle, &nprocs, &procs[0])) status = &DeviceStatus{ Power: uint(power / 1000), Temperature: uint(temp), Utilization: UtilizationInfo{ GPU: uint(usage.gpu), Encoder: uint(encoder[0]), Decoder: uint(decoder[0]), }, Memory: MemoryInfo{ GlobalUsed: uint64(mem.used / (1024 * 1024)), }, Clocks: ClockInfo{ Core: uint(clock[0]), Memory: uint(clock[1]), }, PCI: PCIStatusInfo{ BAR1Used: uint64(bar1.bar1Used / (1024 * 1024)), }, } r := C.nvmlDeviceGetMemoryErrorCounter(d.handle, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L1_CACHE, &ecc[0]) if r != C.NVML_ERROR_NOT_SUPPORTED { // only supported on Tesla cards assert(r) assert(C.nvmlDeviceGetMemoryErrorCounter(d.handle, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L2_CACHE, &ecc[1])) assert(C.nvmlDeviceGetMemoryErrorCounter(d.handle, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &ecc[2])) status.Memory.ECCErrors = ECCErrorsInfo{uint64(ecc[0]), uint64(ecc[1]), uint64(ecc[2])} } r = C.nvmlDeviceGetPcieThroughput(d.handle, C.NVML_PCIE_UTIL_RX_BYTES, &throughput[0]) if r != C.NVML_ERROR_NOT_SUPPORTED { // only supported on Maxwell or newer assert(r) assert(C.nvmlDeviceGetPcieThroughput(d.handle, C.NVML_PCIE_UTIL_TX_BYTES, &throughput[1])) status.PCI.Throughput = PCIThroughputInfo{uint(throughput[0]), uint(throughput[1])} } status.Processes = make([]ProcessInfo, nprocs) for i := range status.Processes { status.Processes[i].PID = uint(procs[i].pid) assert(C.nvmlSystemGetProcessName(procs[i].pid, &procname[0], szProcName)) status.Processes[i].Name = C.GoString(&procname[0]) } return }