예제 #1
0
func (h handle) deviceGetClockInfo() (*uint, *uint, error) {
	var sm, mem C.uint

	r := C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
	if r == C.NVML_ERROR_NOT_SUPPORTED {
		return nil, nil, nil
	}
	if r == C.NVML_SUCCESS {
		r = C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
	}
	return uintPtr(sm), uintPtr(mem), errorString(r)
}
예제 #2
0
func (d *Device) Status() (status *DeviceStatus, err error) {
	var (
		power      C.uint
		temp       C.uint
		usage      C.nvmlUtilization_t
		encoder    [2]C.uint
		decoder    [2]C.uint
		mem        C.nvmlMemory_t
		ecc        [3]C.ulonglong
		clock      [2]C.uint
		bar1       C.nvmlBAR1Memory_t
		throughput [2]C.uint
		procname   [szProcName]C.char
		procs      [szProcs]C.nvmlProcessInfo_t
		nprocs     = C.uint(szProcs)
	)

	defer func() {
		if r := recover(); r != nil {
			err = r.(error)
		}
	}()

	assert(C.nvmlDeviceGetPowerUsage(d.handle, &power))
	assert(C.nvmlDeviceGetTemperature(d.handle, C.NVML_TEMPERATURE_GPU, &temp))
	assert(C.nvmlDeviceGetUtilizationRates(d.handle, &usage))
	assert(C.nvmlDeviceGetEncoderUtilization(d.handle, &encoder[0], &encoder[1]))
	assert(C.nvmlDeviceGetDecoderUtilization(d.handle, &decoder[0], &decoder[1]))
	assert(C.nvmlDeviceGetMemoryInfo(d.handle, &mem))
	assert(C.nvmlDeviceGetClockInfo(d.handle, C.NVML_CLOCK_SM, &clock[0]))
	assert(C.nvmlDeviceGetClockInfo(d.handle, C.NVML_CLOCK_MEM, &clock[1]))
	assert(C.nvmlDeviceGetBAR1MemoryInfo(d.handle, &bar1))
	assert(C.nvmlDeviceGetComputeRunningProcesses(d.handle, &nprocs, &procs[0]))

	status = &DeviceStatus{
		Power:       uint(power / 1000),
		Temperature: uint(temp),
		Utilization: UtilizationInfo{
			GPU:     uint(usage.gpu),
			Encoder: uint(encoder[0]),
			Decoder: uint(decoder[0]),
		},
		Memory: MemoryInfo{
			GlobalUsed: uint64(mem.used / (1024 * 1024)),
		},
		Clocks: ClockInfo{
			Core:   uint(clock[0]),
			Memory: uint(clock[1]),
		},
		PCI: PCIStatusInfo{
			BAR1Used: uint64(bar1.bar1Used / (1024 * 1024)),
		},
	}

	r := C.nvmlDeviceGetMemoryErrorCounter(d.handle, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, C.NVML_VOLATILE_ECC,
		C.NVML_MEMORY_LOCATION_L1_CACHE, &ecc[0])
	if r != C.NVML_ERROR_NOT_SUPPORTED { // only supported on Tesla cards
		assert(r)
		assert(C.nvmlDeviceGetMemoryErrorCounter(d.handle, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, C.NVML_VOLATILE_ECC,
			C.NVML_MEMORY_LOCATION_L2_CACHE, &ecc[1]))
		assert(C.nvmlDeviceGetMemoryErrorCounter(d.handle, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, C.NVML_VOLATILE_ECC,
			C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &ecc[2]))
		status.Memory.ECCErrors = ECCErrorsInfo{uint64(ecc[0]), uint64(ecc[1]), uint64(ecc[2])}
	}

	r = C.nvmlDeviceGetPcieThroughput(d.handle, C.NVML_PCIE_UTIL_RX_BYTES, &throughput[0])
	if r != C.NVML_ERROR_NOT_SUPPORTED { // only supported on Maxwell or newer
		assert(r)
		assert(C.nvmlDeviceGetPcieThroughput(d.handle, C.NVML_PCIE_UTIL_TX_BYTES, &throughput[1]))
		status.PCI.Throughput = PCIThroughputInfo{uint(throughput[0]), uint(throughput[1])}
	}

	status.Processes = make([]ProcessInfo, nprocs)
	for i := range status.Processes {
		status.Processes[i].PID = uint(procs[i].pid)
		assert(C.nvmlSystemGetProcessName(procs[i].pid, &procname[0], szProcName))
		status.Processes[i].Name = C.GoString(&procname[0])
	}
	return
}