func svmbasic(size cl.CL_size_t, context cl.CL_context, queue cl.CL_command_queue, kernel cl.CL_kernel) { // Prepare input data as follows. // Build two arrays: // - an array that consists of the Element structures // (refer to svmbasic.h for the structure definition) // - an array that consists of the float values // // Each structure of the first array has the following pointers: // - 'internal', which points to a 'value' field of another entry // of the same array. // - 'external', which points to a float value from the the // second array. // // Pointers are set randomly. The structures do not reflect any real usage // scenario, but are illustrative for a simple device-side traversal. // // Array of Element Array of floats // structures // // ||====================|| // || ............. || ||============|| // || ............. ||<-----+ || .......... || // ||====================|| | || float || // || float* internal--||------+ || float || // || float* external--||------------------>|| float || // || float value <----||------+ || .......... || // ||====================|| | || .......... || // || ............. || | || float || // || ............. || | || float || // ||====================|| | || float || // ||====================|| | || float || // || float* internal--||------+ || float || // || float* external--||------------------>|| float || // || float value || || float || // ||====================|| || float || // || ............. || || .......... || // || ............. || ||============|| // ||====================|| // // The two arrays are created independently and are used to illustrate // two new OpenCL 2.0 API functions: // - the array of Element structures is passed to the kernel as a // kernel argument with the clSetKernelArgSVMPointer function // - the array of floats is used by the kernel indirectly, and this // dependency should be also specified with the clSetKernelExecInfo // function prior to the kernel execution var err cl.CL_int // To enable host & device code to share pointer to the same address space // the arrays should be allocated as SVM memory. Use the clSVMAlloc function // to allocate SVM memory. // // Optionally, this function allows specifying alignment in bytes as its // last argument. As this basic example doesn't require any _special_ alignment, // the following code illustrates requesting default alignment via passing // zero value. inputElements := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_READ_ONLY|cl.CL_MEM_SVM_FINE_GRAIN_BUFFER, size*cl.CL_size_t(unsafe.Sizeof(sampleElement)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) if nil == inputElements { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } defer cl.CLSVMFree(context, inputElements) inputFloats := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_READ_ONLY|cl.CL_MEM_SVM_FINE_GRAIN_BUFFER, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) if nil == inputFloats { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } defer cl.CLSVMFree(context, inputFloats) // The OpenCL kernel uses the aforementioned input arrays to compute // values for the output array. output := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_WRITE_ONLY|cl.CL_MEM_SVM_FINE_GRAIN_BUFFER, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) defer cl.CLSVMFree(context, output) if nil == output { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } // Note: in the coarse-grained SVM, mapping of inputElement and inputFloats is // needed to do the following initialization. While here, in the fine-grained SVM, // it is not necessary. // Populate data-structures with initial data. r := rand.New(rand.NewSource(99)) for i := cl.CL_size_t(0); i < size; i++ { inputElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(i)*unsafe.Sizeof(sampleElement))) inputFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(inputFloats) + uintptr(i)*unsafe.Sizeof(sampleFloat))) randElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(r.Intn(int(size)))*unsafe.Sizeof(sampleElement))) randFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(inputFloats) + uintptr(r.Intn(int(size)))*unsafe.Sizeof(sampleFloat))) inputElement.internal = &(randElement.value) inputElement.external = randFloat inputElement.value = cl.CL_float(i) *inputFloat = cl.CL_float(i + size) } // Note: in the coarse-grained SVM, unmapping of inputElement and inputFloats is // needed before scheduling the kernel for execution. While here, in the fine-grained SVM, // it is not necessary. // Pass arguments to the kernel. // According to the OpenCL 2.0 specification, you need to use a special // function to pass a pointer from SVM memory to kernel. err = cl.CLSetKernelArgSVMPointer(kernel, 0, inputElements) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelArgSVMPointer") err = cl.CLSetKernelArgSVMPointer(kernel, 1, output) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelArgSVMPointer") // For buffer based SVM (both coarse- and fine-grain) if one SVM buffer // points to memory allocated in another SVM buffer, such allocations // should be passed to the kernel via clSetKernelExecInfo. err = cl.CLSetKernelExecInfo(kernel, cl.CL_KERNEL_EXEC_INFO_SVM_PTRS, cl.CL_size_t(unsafe.Sizeof(inputFloats)), unsafe.Pointer(&inputFloats)) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelExecInfo") // Run the kernel. println("Running kernel...") var globalWorkSize [1]cl.CL_size_t globalWorkSize[0] = size err = cl.CLEnqueueNDRangeKernel(queue, kernel, 1, nil, globalWorkSize[:], nil, 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueNDRangeKernel") // Note: In the fine-grained SVM, after enqueuing the kernel above, the host application is // not blocked from accessing SVM allocations that were passed to the kernel. The host // can access the same regions of SVM memory as does the kernel if the kernel and the host // read/modify different bytes. If one side (host or device) needs to modify the same bytes // that are simultaniously read/modified by another side, atomics operations are usually // required to maintain sufficient memory consistency. This sample doesn't use this possibility // and the host just waits in clFinish below until the kernel is finished. err = cl.CLFinish(queue) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLFinish") println(" DONE.") // Validate output state for correctness. // Compare: in the coarse-grained SVM case you need to map the output. // Here it is not needed. println("Checking correctness of the output buffer...") for i := cl.CL_size_t(0); i < size; i++ { inputElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(i)*unsafe.Sizeof(sampleElement))) outputFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(output) + uintptr(i)*unsafe.Sizeof(sampleFloat))) expectedValue := *(inputElement.internal) + *(inputElement.external) if *outputFloat != expectedValue { println(" FAILED.") fmt.Printf("Mismatch at position %d, read %f, expected %f\n", i, *outputFloat, expectedValue) return } } println(" PASSED.") }
func svmbasic(size cl.CL_size_t, context cl.CL_context, queue cl.CL_command_queue, kernel cl.CL_kernel) { // Prepare input data as follows. // Build two arrays: // - an array that consists of the Element structures // (refer to svmbasic.h for the structure definition) // - an array that consists of the float values // // Each structure of the first array has the following pointers: // - 'internal', which points to a 'value' field of another entry // of the same array. // - 'external', which points to a float value from the the // second array. // // Pointers are set randomly. The structures do not reflect any real usage // scenario, but are illustrative for a simple device-side traversal. // // Array of Element Array of floats // structures // // ||====================|| // || ............. || ||============|| // || ............. ||<-----+ || .......... || // ||====================|| | || float || // || float* internal--||------+ || float || // || float* external--||------------------>|| float || // || float value <----||------+ || .......... || // ||====================|| | || .......... || // || ............. || | || float || // || ............. || | || float || // ||====================|| | || float || // ||====================|| | || float || // || float* internal--||------+ || float || // || float* external--||------------------>|| float || // || float value || || float || // ||====================|| || float || // || ............. || || .......... || // || ............. || ||============|| // ||====================|| // // The two arrays are created independently and are used to illustrate // two new OpenCL 2.0 API functions: // - the array of Element structures is passed to the kernel as a // kernel argument with the clSetKernelArgSVMPointer function // - the array of floats is used by the kernel indirectly, and this // dependency should be also specified with the clSetKernelExecInfo // function prior to the kernel execution var err cl.CL_int // To enable host & device code to share pointer to the same address space // the arrays should be allocated as SVM memory. Use the clSVMAlloc function // to allocate SVM memory. // // Optionally, this function allows specifying alignment in bytes as its // last argument. As this basic example doesn't require any _special_ alignment, // the following code illustrates requesting default alignment via passing // zero value. inputElements := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_READ_ONLY, size*cl.CL_size_t(unsafe.Sizeof(sampleElement)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) if nil == inputElements { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } defer cl.CLSVMFree(context, inputElements) inputFloats := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_READ_ONLY, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) if nil == inputFloats { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } defer cl.CLSVMFree(context, inputFloats) // The OpenCL kernel uses the aforementioned input arrays to compute // values for the output array. output := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_WRITE_ONLY, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) defer cl.CLSVMFree(context, output) if nil == output { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } // In the coarse-grained buffer SVM model, only one OpenCL device (or // host) can have ownership for writing to the buffer. Specifically, host // explicitly requests the ownership by mapping/unmapping the SVM buffer. // // So to fill the input SVM buffers on the host, you need to map them to have // access from the host program. // // The following two map calls are required in case of coarse-grained SVM only. err = cl.CLEnqueueSVMMap(queue, cl.CL_TRUE, // blocking map cl.CL_MAP_WRITE, inputElements, size*cl.CL_size_t(unsafe.Sizeof(sampleElement)), 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMMap") err = cl.CLEnqueueSVMMap(queue, cl.CL_TRUE, // blocking map cl.CL_MAP_WRITE, inputFloats, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMMap") // Populate data-structures with initial data. r := rand.New(rand.NewSource(99)) for i := cl.CL_size_t(0); i < size; i++ { inputElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(i)*unsafe.Sizeof(sampleElement))) inputFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(inputFloats) + uintptr(i)*unsafe.Sizeof(sampleFloat))) randElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(r.Intn(int(size)))*unsafe.Sizeof(sampleElement))) randFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(inputFloats) + uintptr(r.Intn(int(size)))*unsafe.Sizeof(sampleFloat))) inputElement.internal = &(randElement.value) inputElement.external = randFloat inputElement.value = cl.CL_float(i) *inputFloat = cl.CL_float(i + size) } // The following two unmap calls are required in case of coarse-grained SVM only err = cl.CLEnqueueSVMUnmap(queue, inputElements, 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMUnmap") err = cl.CLEnqueueSVMUnmap(queue, inputFloats, 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMUnmap") // Pass arguments to the kernel. // According to the OpenCL 2.0 specification, you need to use a special // function to pass a pointer from SVM memory to kernel. err = cl.CLSetKernelArgSVMPointer(kernel, 0, inputElements) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelArgSVMPointer") err = cl.CLSetKernelArgSVMPointer(kernel, 1, output) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelArgSVMPointer") // For buffer based SVM (both coarse- and fine-grain) if one SVM buffer // points to memory allocated in another SVM buffer, such allocations // should be passed to the kernel via clSetKernelExecInfo. err = cl.CLSetKernelExecInfo(kernel, cl.CL_KERNEL_EXEC_INFO_SVM_PTRS, cl.CL_size_t(unsafe.Sizeof(inputFloats)), unsafe.Pointer(&inputFloats)) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelExecInfo") // Run the kernel. println("Running kernel...") var globalWorkSize [1]cl.CL_size_t globalWorkSize[0] = size err = cl.CLEnqueueNDRangeKernel(queue, kernel, 1, nil, globalWorkSize[:], nil, 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueNDRangeKernel") // Map the output SVM buffer to read the results. // Mapping is required for coarse-grained SVM only. err = cl.CLEnqueueSVMMap(queue, cl.CL_TRUE, // blocking map cl.CL_MAP_READ, output, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMMap") println(" DONE.") // Validate output state for correctness. println("Checking correctness of the output buffer...") for i := cl.CL_size_t(0); i < size; i++ { inputElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(i)*unsafe.Sizeof(sampleElement))) outputFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(output) + uintptr(i)*unsafe.Sizeof(sampleFloat))) expectedValue := *(inputElement.internal) + *(inputElement.external) if *outputFloat != expectedValue { println(" FAILED.") fmt.Printf("Mismatch at position %d, read %f, expected %f\n", i, *outputFloat, expectedValue) return } } println(" PASSED.") err = cl.CLEnqueueSVMUnmap(queue, output, 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMUnmap") err = cl.CLFinish(queue) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLFinish") }