Example #1
0
// deviceGetBAR1MemoryInfo queries NVML for the BAR1 memory figures of the
// device behind h.
//
// On success it returns pointers to the bar1Total and bar1Used values, in that
// order, and a nil error. When NVML answers NVML_ERROR_NOT_SUPPORTED all three
// results are nil, so callers must nil-check the pointers even when the error
// is nil. For any other status that errorString turns into a non-nil error,
// both pointers are nil and that error is returned.
func (h handle) deviceGetBAR1MemoryInfo() (*uint64, *uint64, error) {
	var bar1 C.nvmlBAR1Memory_t

	r := C.nvmlDeviceGetBAR1MemoryInfo(h.dev, &bar1)
	if r == C.NVML_ERROR_NOT_SUPPORTED {
		// Lack of support is reported as "no data", not as a failure.
		return nil, nil, nil
	}
	if err := errorString(r); err != nil {
		// bar1 may not have been filled in; do not hand out pointers to
		// meaningless values alongside an error.
		return nil, nil, err
	}
	return uint64Ptr(bar1.bar1Total), uint64Ptr(bar1.bar1Used), nil
}
Example #2
0
// NewDevice builds the Device description for the GPU at NVML index idx.
//
// It asks NVML for the device handle, name, UUID, PCI info, minor number,
// BAR1 memory info, power management limit, maximum SM and memory clocks,
// maximum PCIe link generation and width, and the CPU affinity mask, then
// resolves the mask through mask.cpuNode.
//
// Every NVML status is passed to assert (defined elsewhere; it is expected to
// panic with an error on failure). Such a panic is recovered here and returned
// as err with a nil device. An error from mask.cpuNode is returned as is, also
// with a nil device. A panic whose value is not an error is re-raised
// unchanged.
func NewDevice(idx uint) (device *Device, err error) {
	var (
		dev   C.nvmlDevice_t
		model [szModel]C.char
		uuid  [szUUID]C.char
		pci   C.nvmlPciInfo_t
		minor C.uint
		bar1  C.nvmlBAR1Memory_t
		power C.uint
		clock [2]C.uint // [0] = SM clock, [1] = memory clock
		pciel [2]C.uint // [0] = max link generation, [1] = max link width
		mask  cpuMask
	)

	defer func() {
		r := recover()
		if r == nil {
			return
		}
		e, ok := r.(error)
		if !ok {
			// Not an error value: re-raise the original panic instead of
			// masking it with a failed type assertion.
			panic(r)
		}
		device, err = nil, e
	}()

	assert(C.nvmlDeviceGetHandleByIndex(C.uint(idx), &dev))
	assert(C.nvmlDeviceGetName(dev, &model[0], szModel))
	assert(C.nvmlDeviceGetUUID(dev, &uuid[0], szUUID))
	assert(C.nvmlDeviceGetPciInfo(dev, &pci))
	assert(C.nvmlDeviceGetMinorNumber(dev, &minor))
	assert(C.nvmlDeviceGetBAR1MemoryInfo(dev, &bar1))
	assert(C.nvmlDeviceGetPowerManagementLimit(dev, &power))
	assert(C.nvmlDeviceGetMaxClockInfo(dev, C.NVML_CLOCK_SM, &clock[0]))
	assert(C.nvmlDeviceGetMaxClockInfo(dev, C.NVML_CLOCK_MEM, &clock[1]))
	assert(C.nvmlDeviceGetMaxPcieLinkGeneration(dev, &pciel[0]))
	assert(C.nvmlDeviceGetMaxPcieLinkWidth(dev, &pciel[1]))
	assert(C.nvmlDeviceGetCpuAffinity(dev, C.uint(len(mask)), (*C.ulong)(&mask[0])))
	cpu, err := mask.cpuNode()
	if err != nil {
		return nil, err
	}

	device = &Device{
		handle: dev,
		Model:  C.GoString(&model[0]),
		UUID:   C.GoString(&uuid[0]),
		// The device node path is derived from the NVML minor number.
		Path: fmt.Sprintf("/dev/nvidia%d", uint(minor)),
		// Raw NVML limit scaled down by 1000 (presumably mW -> W; confirm
		// against the NVML reference).
		Power:       uint(power / 1000),
		CPUAffinity: cpu,
		PCI: PCIInfo{
			BusID: C.GoString(&pci.busId[0]),
			// Scaled by 1024*1024 (presumably bytes -> MiB).
			BAR1: uint64(bar1.bar1Total / (1024 * 1024)),
			// Per-generation figure from pcieGenToBandwidth times the link
			// width, scaled down by 1000.
			Bandwidth: pcieGenToBandwidth[int(pciel[0])] * uint(pciel[1]) / 1000,
		},
		Clocks: ClockInfo{
			Core:   uint(clock[0]),
			Memory: uint(clock[1]),
		},
	}
	return device, nil
}
Example #3
0
// Status takes a snapshot of the device's current state from NVML.
//
// It reads power usage, GPU temperature, utilization rates, encoder and
// decoder utilization, memory info, current SM and memory clocks, BAR1 memory
// info and the list of running compute processes (up to szProcs entries, with
// their names). ECC error counters and PCIe throughput are filled in only when
// NVML does not answer NVML_ERROR_NOT_SUPPORTED for them.
//
// Every NVML status is passed to assert (defined elsewhere; it is expected to
// panic with an error on failure). Such a panic is recovered here and returned
// as err. The snapshot is assembled in a local value and only handed back on
// success, so a failure never yields a partially populated status: status is
// nil whenever err is non-nil. A panic whose value is not an error is
// re-raised unchanged.
func (d *Device) Status() (status *DeviceStatus, err error) {
	var (
		power      C.uint
		temp       C.uint
		usage      C.nvmlUtilization_t
		encoder    [2]C.uint // only [0] is reported; [1] is the second out-param and is discarded
		decoder    [2]C.uint // only [0] is reported; [1] is the second out-param and is discarded
		mem        C.nvmlMemory_t
		ecc        [3]C.ulonglong // L1 cache, L2 cache, device memory
		clock      [2]C.uint      // [0] = SM clock, [1] = memory clock
		bar1       C.nvmlBAR1Memory_t
		throughput [2]C.uint // [0] = RX, [1] = TX
		procname   [szProcName]C.char
		procs      [szProcs]C.nvmlProcessInfo_t
		nprocs     = C.uint(szProcs) // in: capacity of procs; out: number of entries written
	)

	defer func() {
		r := recover()
		if r == nil {
			return
		}
		e, ok := r.(error)
		if !ok {
			// Not an error value: re-raise the original panic instead of
			// masking it with a failed type assertion.
			panic(r)
		}
		status, err = nil, e
	}()

	assert(C.nvmlDeviceGetPowerUsage(d.handle, &power))
	assert(C.nvmlDeviceGetTemperature(d.handle, C.NVML_TEMPERATURE_GPU, &temp))
	assert(C.nvmlDeviceGetUtilizationRates(d.handle, &usage))
	assert(C.nvmlDeviceGetEncoderUtilization(d.handle, &encoder[0], &encoder[1]))
	assert(C.nvmlDeviceGetDecoderUtilization(d.handle, &decoder[0], &decoder[1]))
	assert(C.nvmlDeviceGetMemoryInfo(d.handle, &mem))
	assert(C.nvmlDeviceGetClockInfo(d.handle, C.NVML_CLOCK_SM, &clock[0]))
	assert(C.nvmlDeviceGetClockInfo(d.handle, C.NVML_CLOCK_MEM, &clock[1]))
	assert(C.nvmlDeviceGetBAR1MemoryInfo(d.handle, &bar1))
	assert(C.nvmlDeviceGetComputeRunningProcesses(d.handle, &nprocs, &procs[0]))

	// Build into a local so that a later assert failure cannot leak a
	// half-filled result through the named return value.
	st := &DeviceStatus{
		// Raw NVML reading scaled down by 1000 (presumably mW -> W).
		Power:       uint(power / 1000),
		Temperature: uint(temp),
		Utilization: UtilizationInfo{
			GPU:     uint(usage.gpu),
			Encoder: uint(encoder[0]),
			Decoder: uint(decoder[0]),
		},
		Memory: MemoryInfo{
			// Scaled by 1024*1024 (presumably bytes -> MiB).
			GlobalUsed: uint64(mem.used / (1024 * 1024)),
		},
		Clocks: ClockInfo{
			Core:   uint(clock[0]),
			Memory: uint(clock[1]),
		},
		PCI: PCIStatusInfo{
			// Scaled by 1024*1024 (presumably bytes -> MiB).
			BAR1Used: uint64(bar1.bar1Used / (1024 * 1024)),
		},
	}

	// The first ECC query doubles as the support probe; the remaining
	// locations are only queried when it is supported.
	r := C.nvmlDeviceGetMemoryErrorCounter(d.handle, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, C.NVML_VOLATILE_ECC,
		C.NVML_MEMORY_LOCATION_L1_CACHE, &ecc[0])
	if r != C.NVML_ERROR_NOT_SUPPORTED { // only supported on Tesla cards
		assert(r)
		assert(C.nvmlDeviceGetMemoryErrorCounter(d.handle, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, C.NVML_VOLATILE_ECC,
			C.NVML_MEMORY_LOCATION_L2_CACHE, &ecc[1]))
		assert(C.nvmlDeviceGetMemoryErrorCounter(d.handle, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, C.NVML_VOLATILE_ECC,
			C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &ecc[2]))
		st.Memory.ECCErrors = ECCErrorsInfo{uint64(ecc[0]), uint64(ecc[1]), uint64(ecc[2])}
	}

	// Same probe-then-query pattern for PCIe throughput.
	r = C.nvmlDeviceGetPcieThroughput(d.handle, C.NVML_PCIE_UTIL_RX_BYTES, &throughput[0])
	if r != C.NVML_ERROR_NOT_SUPPORTED { // only supported on Maxwell or newer
		assert(r)
		assert(C.nvmlDeviceGetPcieThroughput(d.handle, C.NVML_PCIE_UTIL_TX_BYTES, &throughput[1]))
		st.PCI.Throughput = PCIThroughputInfo{uint(throughput[0]), uint(throughput[1])}
	}

	st.Processes = make([]ProcessInfo, nprocs)
	for i := range st.Processes {
		st.Processes[i].PID = uint(procs[i].pid)
		// procname is reused for every process; GoString copies it out.
		assert(C.nvmlSystemGetProcessName(procs[i].pid, &procname[0], szProcName))
		st.Processes[i].Name = C.GoString(&procname[0])
	}
	return st, nil
}