func main() { // Use this to check the output of each API call var status cl.CL_int //----------------------------------------------------- // STEP 1: Discover and initialize the platforms //----------------------------------------------------- var numPlatforms cl.CL_uint var platforms []cl.CL_platform_id // Use clGetPlatformIDs() to retrieve the number of // platforms status = cl.CLGetPlatformIDs(0, nil, &numPlatforms) // Allocate enough space for each platform platforms = make([]cl.CL_platform_id, numPlatforms) // Fill in platforms with clGetPlatformIDs() status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetPlatformIDs") //----------------------------------------------------- // STEP 2: Discover and initialize the GPU devices //----------------------------------------------------- var numDevices cl.CL_uint var devices []cl.CL_device_id // Use clGetDeviceIDs() to retrieve the number of // devices present status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU, 0, nil, &numDevices) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs") // Allocate enough space for each device devices = make([]cl.CL_device_id, numDevices) // Fill in devices with clGetDeviceIDs() status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU, numDevices, devices, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs") //----------------------------------------------------- // STEP 3: Create a context //----------------------------------------------------- var context cl.CL_context // Create a context using clCreateContext() and // associate it with the devices context = cl.CLCreateContext(nil, numDevices, devices, nil, nil, &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateContext") defer cl.CLReleaseContext(context) //----------------------------------------------------- // STEP 4: Create a command queue //----------------------------------------------------- var commandQueue [MAX_COMMAND_QUEUE]cl.CL_command_queue // Create a command queue using clCreateCommandQueueWithProperties(), // and associate it with the device you want to execute for i := 0; i < MAX_COMMAND_QUEUE; i++ { commandQueue[i] = cl.CLCreateCommandQueueWithProperties(context, devices[0], nil, &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateCommandQueueWithProperties") defer cl.CLReleaseCommandQueue(commandQueue[i]) } //----------------------------------------------------- // STEP 5: Create device buffers //----------------------------------------------------- producerGroupSize := cl.CL_size_t(PRODUCER_GROUP_SIZE) producerGlobalSize := cl.CL_size_t(PRODUCER_GLOBAL_SIZE) consumerGroupSize := cl.CL_size_t(CONSUMER_GROUP_SIZE) consumerGlobalSize := cl.CL_size_t(CONSUMER_GLOBAL_SIZE) var samplePipePkt [2]cl.CL_float szPipe := cl.CL_uint(PIPE_SIZE) szPipePkt := cl.CL_uint(unsafe.Sizeof(samplePipePkt)) if szPipe%PRNG_CHANNELS != 0 { szPipe = (szPipe/PRNG_CHANNELS)*PRNG_CHANNELS + PRNG_CHANNELS } consumerGlobalSize = cl.CL_size_t(szPipe) pipePktPerThread := cl.CL_int(szPipe) / PRNG_CHANNELS seed := cl.CL_int(SEED) rngType := cl.CL_int(RV_GAUSSIAN) var histMin cl.CL_float var histMax cl.CL_float if rngType == cl.CL_int(RV_UNIFORM) { histMin = 0.0 histMax = 1.0 } else { histMin = -10.0 histMax = 10.0 } localDevHist := make([]cl.CL_int, MAX_HIST_BINS) cpuHist := make([]cl.CL_int, MAX_HIST_BINS) //Create and initialize memory objects rngPipe := cl.CLCreatePipe(context, cl.CL_MEM_READ_WRITE, szPipePkt, szPipe, nil, &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clCreatePipe") devHist := cl.CLCreateBuffer(context, cl.CL_MEM_READ_WRITE|cl.CL_MEM_COPY_HOST_PTR, MAX_HIST_BINS*cl.CL_size_t(unsafe.Sizeof(localDevHist[0])), unsafe.Pointer(&localDevHist[0]), &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clCreateBuffer") //----------------------------------------------------- // STEP 6: Create and compile the program //----------------------------------------------------- programSource, programeSize := utils.Load_programsource("pipe.cl") // Create a program using clCreateProgramWithSource() program := cl.CLCreateProgramWithSource(context, 1, programSource[:], programeSize[:], &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateProgramWithSource") defer cl.CLReleaseProgram(program) // Build (compile) the program for the devices with // clBuildProgram() options := "-cl-std=CL2.0" status = cl.CLBuildProgram(program, numDevices, devices, []byte(options), nil, nil) if status != cl.CL_SUCCESS { var program_log interface{} var log_size cl.CL_size_t /* Find size of log and print to std output */ cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG, 0, nil, &log_size) cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG, log_size, &program_log, nil) fmt.Printf("%s\n", program_log) return } //utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLBuildProgram") //----------------------------------------------------- // STEP 7: Create the kernel //----------------------------------------------------- // Use clCreateKernel() to create a kernel produceKernel := cl.CLCreateKernel(program, []byte("pipe_producer"), &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel") defer cl.CLReleaseKernel(produceKernel) consumeKernel := cl.CLCreateKernel(program, []byte("pipe_consumer"), &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel") defer cl.CLReleaseKernel(consumeKernel) //----------------------------------------------------- // STEP 8: Set the kernel arguments //----------------------------------------------------- // Associate the input and output buffers with the // kernel // using clSetKernelArg() // Set appropriate arguments to the kernel status = cl.CLSetKernelArg(produceKernel, 0, cl.CL_size_t(unsafe.Sizeof(rngPipe)), unsafe.Pointer(&rngPipe)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(rngPipe)") status = cl.CLSetKernelArg(produceKernel, 1, cl.CL_size_t(unsafe.Sizeof(pipePktPerThread)), unsafe.Pointer(&pipePktPerThread)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(pipePktPerThread)") status = cl.CLSetKernelArg(produceKernel, 2, cl.CL_size_t(unsafe.Sizeof(seed)), unsafe.Pointer(&seed)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(seed)") status = cl.CLSetKernelArg(produceKernel, 3, cl.CL_size_t(unsafe.Sizeof(rngType)), unsafe.Pointer(&rngType)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(rngType)") //----------------------------------------------------- // STEP 9: Configure the work-item structure //----------------------------------------------------- // Define an index space (global work size) of work // items for // execution. A workgroup size (local work size) is not // required, // but can be used. // Enqueue both the kernels. var globalThreads = []cl.CL_size_t{producerGlobalSize} var localThreads = []cl.CL_size_t{producerGroupSize} //----------------------------------------------------- // STEP 10: Enqueue the kernel for execution //----------------------------------------------------- // Execute the kernel by using // clEnqueueNDRangeKernel(). // 'globalWorkSize' is the 1D dimension of the // work-items var produceEvt [1]cl.CL_event status = cl.CLEnqueueNDRangeKernel(commandQueue[0], produceKernel, 1, nil, globalThreads, localThreads, 0, nil, &produceEvt[0]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueNDRangeKernel") /* launch consumer kernel only after producer has finished. This is done to avoid concurrent kernels execution as the memory consistency of pipe is guaranteed only across synchronization points. */ status = cl.CLWaitForEvents(1, produceEvt[:]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clWaitForEvents(produceEvt)") //----------------------------------------------------- // STEP 8: Set the kernel arguments //----------------------------------------------------- // Associate the input and output buffers with the // kernel // using clSetKernelArg() // Set appropriate arguments to the kernel status = cl.CLSetKernelArg(consumeKernel, 0, cl.CL_size_t(unsafe.Sizeof(rngPipe)), unsafe.Pointer(&rngPipe)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(rngPipe)") status = cl.CLSetKernelArg(consumeKernel, 1, cl.CL_size_t(unsafe.Sizeof(devHist)), unsafe.Pointer(&devHist)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(devHist)") status = cl.CLSetKernelArg(consumeKernel, 2, cl.CL_size_t(unsafe.Sizeof(histMin)), unsafe.Pointer(&histMin)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(histMin)") status = cl.CLSetKernelArg(consumeKernel, 3, cl.CL_size_t(unsafe.Sizeof(histMax)), unsafe.Pointer(&histMax)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(histMax)") //----------------------------------------------------- // STEP 9: Configure the work-item structure //----------------------------------------------------- // Define an index space (global work size) of work // items for // execution. A workgroup size (local work size) is not // required, // but can be used. globalThreads[0] = consumerGlobalSize localThreads[0] = consumerGroupSize //----------------------------------------------------- // STEP 10: Enqueue the kernel for execution //----------------------------------------------------- // Execute the kernel by using // clEnqueueNDRangeKernel(). // 'globalWorkSize' is the 1D dimension of the // work-items var consumeEvt [1]cl.CL_event status = cl.CLEnqueueNDRangeKernel( commandQueue[1], consumeKernel, 1, nil, globalThreads, localThreads, 0, nil, &consumeEvt[0]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueNDRangeKernel") status = cl.CLFlush(commandQueue[0]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFlush(0)") status = cl.CLFlush(commandQueue[1]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFlush(1)") //wait for kernels to finish status = cl.CLFinish(commandQueue[0]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFinish(0)") status = cl.CLFinish(commandQueue[1]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFinish(1)") //----------------------------------------------------- // STEP 11: Read the output buffer back to the host //----------------------------------------------------- // Use clEnqueueReadBuffer() to read the OpenCL output // buffer (bufferC) // to the host output array (C) //copy the data back to host buffer var readEvt cl.CL_event status = cl.CLEnqueueReadBuffer(commandQueue[1], devHist, cl.CL_TRUE, 0, (MAX_HIST_BINS)*cl.CL_size_t(unsafe.Sizeof(localDevHist[0])), unsafe.Pointer(&localDevHist[0]), 0, nil, &readEvt) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueReadBuffer") //----------------------------------------------------- // STEP 12: Verify the results //----------------------------------------------------- //Find the tolerance limit fTol := (float32)(CONSUMER_GLOBAL_SIZE) * (float32)(COMP_TOL) / (float32)(100.0) iTol := (int)(fTol) if iTol == 0 { iTol = 1 } //CPU side histogram computation CPUReference(seed, pipePktPerThread, rngType, cpuHist, histMax, histMin) //Compare for bin := 0; bin < MAX_HIST_BINS; bin++ { diff := int(localDevHist[bin] - cpuHist[bin]) if diff < 0 { diff = -diff } if diff > iTol { println("Failed!") return } } println("Passed!") }
func main() { // Use this to check the output of each API call var status cl.CL_int //----------------------------------------------------- // STEP 1: Discover and initialize the platforms //----------------------------------------------------- var numPlatforms cl.CL_uint var platforms []cl.CL_platform_id // Use clGetPlatformIDs() to retrieve the number of // platforms status = cl.CLGetPlatformIDs(0, nil, &numPlatforms) // Allocate enough space for each platform platforms = make([]cl.CL_platform_id, numPlatforms) // Fill in platforms with clGetPlatformIDs() status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetPlatformIDs") //----------------------------------------------------- // STEP 2: Discover and initialize the GPU devices //----------------------------------------------------- var numDevices cl.CL_uint var devices []cl.CL_device_id // Use clGetDeviceIDs() to retrieve the number of // devices present status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU, 0, nil, &numDevices) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs") // Allocate enough space for each device devices = make([]cl.CL_device_id, numDevices) // Fill in devices with clGetDeviceIDs() status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU, numDevices, devices, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs") //----------------------------------------------------- // STEP 3: Create a context //----------------------------------------------------- var context cl.CL_context // Create a context using clCreateContext() and // associate it with the devices context = cl.CLCreateContext(nil, numDevices, devices, nil, nil, &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateContext") defer cl.CLReleaseContext(context) //----------------------------------------------------- // STEP 4: Create a command queue //----------------------------------------------------- var cmdQueue cl.CL_command_queue // Create a command queue using clCreateCommandQueueWithProperties(), // and associate it with the device you want to execute cmdQueue = cl.CLCreateCommandQueueWithProperties(context, devices[0], nil, &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateCommandQueueWithProperties") defer cl.CLReleaseCommandQueue(cmdQueue) //----------------------------------------------------- // STEP 5: Create device buffers //----------------------------------------------------- // initialize any device/SVM memory here. /* svm buffer for binary tree */ svmTreeBuf := cl.CLSVMAlloc(context, cl.CL_MEM_READ_WRITE, cl.CL_size_t(NUMBER_OF_NODES*unsafe.Sizeof(sampleNode)), 0) if nil == svmTreeBuf { println("clSVMAlloc(svmTreeBuf) failed.") return } defer cl.CLSVMFree(context, svmTreeBuf) /* svm buffer for search keys */ svmSearchBuf := cl.CLSVMAlloc(context, cl.CL_MEM_READ_WRITE, cl.CL_size_t(NUMBER_OF_SEARCH_KEY*unsafe.Sizeof(sampleKey)), 0) if nil == svmSearchBuf { println("clSVMAlloc(svmSearchBuf) failed.") return } defer cl.CLSVMFree(context, svmSearchBuf) //create the binary tree and set the root /* root node of the binary tree */ svmRoot := cpuCreateBinaryTree(cmdQueue, svmTreeBuf) //initialize search keys cpuInitSearchKeys(cmdQueue, svmSearchBuf) /* if voice is not deliberately muzzled, shout parameters */ fmt.Printf("-------------------------------------------------------------------------\n") fmt.Printf("Searching %d keys in a BST having %d Nodes...\n", NUMBER_OF_SEARCH_KEY, NUMBER_OF_NODES) fmt.Printf("-------------------------------------------------------------------------\n") //----------------------------------------------------- // STEP 6: Create and compile the program //----------------------------------------------------- programSource, programeSize := utils.Load_programsource("bst.cl") // Create a program using clCreateProgramWithSource() program := cl.CLCreateProgramWithSource(context, 1, programSource[:], programeSize[:], &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateProgramWithSource") defer cl.CLReleaseProgram(program) // Build (compile) the program for the devices with // clBuildProgram() options := "-cl-std=CL2.0" status = cl.CLBuildProgram(program, numDevices, devices, []byte(options), nil, nil) if status != cl.CL_SUCCESS { var program_log interface{} var log_size cl.CL_size_t /* Find size of log and print to std output */ cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG, 0, nil, &log_size) cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG, log_size, &program_log, nil) fmt.Printf("%s\n", program_log) return } //utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLBuildProgram") //----------------------------------------------------- // STEP 7: Create the kernel //----------------------------------------------------- var kernel cl.CL_kernel // Use clCreateKernel() to create a kernel kernel = cl.CLCreateKernel(program, []byte("bst_kernel"), &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel") defer cl.CLReleaseKernel(kernel) //----------------------------------------------------- // STEP 8: Set the kernel arguments //----------------------------------------------------- // Associate the input and output buffers with the // kernel // using clSetKernelArg() // Set appropriate arguments to the kernel status = cl.CLSetKernelArgSVMPointer(kernel, 0, svmTreeBuf) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArgSVMPointer(svmTreeBuf)") status = cl.CLSetKernelArgSVMPointer(kernel, 1, svmSearchBuf) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArgSVMPointer(svmSearchBuf)") //----------------------------------------------------- // STEP 9: Configure the work-item structure //----------------------------------------------------- // Define an index space (global work size) of work // items for // execution. A workgroup size (local work size) is not // required, // but can be used. var localWorkSize [1]cl.CL_size_t var kernelWorkGroupSize interface{} status = cl.CLGetKernelWorkGroupInfo(kernel, devices[0], cl.CL_KERNEL_WORK_GROUP_SIZE, cl.CL_size_t(unsafe.Sizeof(localWorkSize[0])), &kernelWorkGroupSize, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetKernelWorkGroupInfo") localWorkSize[0] = kernelWorkGroupSize.(cl.CL_size_t) var globalWorkSize [1]cl.CL_size_t globalWorkSize[0] = NUMBER_OF_SEARCH_KEY //----------------------------------------------------- // STEP 10: Enqueue the kernel for execution //----------------------------------------------------- // Execute the kernel by using // clEnqueueNDRangeKernel(). // 'globalWorkSize' is the 1D dimension of the // work-items status = cl.CLEnqueueNDRangeKernel(cmdQueue, kernel, 1, nil, globalWorkSize[:], localWorkSize[:], 0, nil, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLEnqueueNDRangeKernel") status = cl.CLFlush(cmdQueue) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFlush") status = cl.CLFinish(cmdQueue) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFinish") //----------------------------------------------------- // STEP 11: Read the output buffer back to the host //----------------------------------------------------- // Use clEnqueueReadBuffer() to read the OpenCL output // buffer (bufferC) // to the host output array (C) //copy the data back to host buffer //this demo doesn't need clEnqueueReadBuffer due to SVM //----------------------------------------------------- // STEP 12: Verify the results //----------------------------------------------------- // reference implementation svmBinaryTreeCPUReference(cmdQueue, svmRoot, svmTreeBuf, svmSearchBuf) // compare the results and see if they match pass := svmCompareResults(cmdQueue, svmSearchBuf) if pass { println("Passed!") } else { println("Failed!") } }
func svmbasic(size cl.CL_size_t, context cl.CL_context, queue cl.CL_command_queue, kernel cl.CL_kernel) { // Prepare input data as follows. // Build two arrays: // - an array that consists of the Element structures // (refer to svmbasic.h for the structure definition) // - an array that consists of the float values // // Each structure of the first array has the following pointers: // - 'internal', which points to a 'value' field of another entry // of the same array. // - 'external', which points to a float value from the the // second array. // // Pointers are set randomly. The structures do not reflect any real usage // scenario, but are illustrative for a simple device-side traversal. // // Array of Element Array of floats // structures // // ||====================|| // || ............. || ||============|| // || ............. ||<-----+ || .......... || // ||====================|| | || float || // || float* internal--||------+ || float || // || float* external--||------------------>|| float || // || float value <----||------+ || .......... || // ||====================|| | || .......... || // || ............. || | || float || // || ............. || | || float || // ||====================|| | || float || // ||====================|| | || float || // || float* internal--||------+ || float || // || float* external--||------------------>|| float || // || float value || || float || // ||====================|| || float || // || ............. || || .......... || // || ............. || ||============|| // ||====================|| // // The two arrays are created independently and are used to illustrate // two new OpenCL 2.0 API functions: // - the array of Element structures is passed to the kernel as a // kernel argument with the clSetKernelArgSVMPointer function // - the array of floats is used by the kernel indirectly, and this // dependency should be also specified with the clSetKernelExecInfo // function prior to the kernel execution var err cl.CL_int // To enable host & device code to share pointer to the same address space // the arrays should be allocated as SVM memory. Use the clSVMAlloc function // to allocate SVM memory. // // Optionally, this function allows specifying alignment in bytes as its // last argument. As this basic example doesn't require any _special_ alignment, // the following code illustrates requesting default alignment via passing // zero value. inputElements := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_READ_ONLY|cl.CL_MEM_SVM_FINE_GRAIN_BUFFER, size*cl.CL_size_t(unsafe.Sizeof(sampleElement)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) if nil == inputElements { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } defer cl.CLSVMFree(context, inputElements) inputFloats := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_READ_ONLY|cl.CL_MEM_SVM_FINE_GRAIN_BUFFER, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) if nil == inputFloats { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } defer cl.CLSVMFree(context, inputFloats) // The OpenCL kernel uses the aforementioned input arrays to compute // values for the output array. output := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_WRITE_ONLY|cl.CL_MEM_SVM_FINE_GRAIN_BUFFER, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) defer cl.CLSVMFree(context, output) if nil == output { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } // Note: in the coarse-grained SVM, mapping of inputElement and inputFloats is // needed to do the following initialization. While here, in the fine-grained SVM, // it is not necessary. // Populate data-structures with initial data. r := rand.New(rand.NewSource(99)) for i := cl.CL_size_t(0); i < size; i++ { inputElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(i)*unsafe.Sizeof(sampleElement))) inputFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(inputFloats) + uintptr(i)*unsafe.Sizeof(sampleFloat))) randElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(r.Intn(int(size)))*unsafe.Sizeof(sampleElement))) randFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(inputFloats) + uintptr(r.Intn(int(size)))*unsafe.Sizeof(sampleFloat))) inputElement.internal = &(randElement.value) inputElement.external = randFloat inputElement.value = cl.CL_float(i) *inputFloat = cl.CL_float(i + size) } // Note: in the coarse-grained SVM, unmapping of inputElement and inputFloats is // needed before scheduling the kernel for execution. While here, in the fine-grained SVM, // it is not necessary. // Pass arguments to the kernel. // According to the OpenCL 2.0 specification, you need to use a special // function to pass a pointer from SVM memory to kernel. err = cl.CLSetKernelArgSVMPointer(kernel, 0, inputElements) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelArgSVMPointer") err = cl.CLSetKernelArgSVMPointer(kernel, 1, output) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelArgSVMPointer") // For buffer based SVM (both coarse- and fine-grain) if one SVM buffer // points to memory allocated in another SVM buffer, such allocations // should be passed to the kernel via clSetKernelExecInfo. err = cl.CLSetKernelExecInfo(kernel, cl.CL_KERNEL_EXEC_INFO_SVM_PTRS, cl.CL_size_t(unsafe.Sizeof(inputFloats)), unsafe.Pointer(&inputFloats)) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelExecInfo") // Run the kernel. println("Running kernel...") var globalWorkSize [1]cl.CL_size_t globalWorkSize[0] = size err = cl.CLEnqueueNDRangeKernel(queue, kernel, 1, nil, globalWorkSize[:], nil, 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueNDRangeKernel") // Note: In the fine-grained SVM, after enqueuing the kernel above, the host application is // not blocked from accessing SVM allocations that were passed to the kernel. The host // can access the same regions of SVM memory as does the kernel if the kernel and the host // read/modify different bytes. If one side (host or device) needs to modify the same bytes // that are simultaniously read/modified by another side, atomics operations are usually // required to maintain sufficient memory consistency. This sample doesn't use this possibility // and the host just waits in clFinish below until the kernel is finished. err = cl.CLFinish(queue) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLFinish") println(" DONE.") // Validate output state for correctness. // Compare: in the coarse-grained SVM case you need to map the output. // Here it is not needed. println("Checking correctness of the output buffer...") for i := cl.CL_size_t(0); i < size; i++ { inputElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(i)*unsafe.Sizeof(sampleElement))) outputFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(output) + uintptr(i)*unsafe.Sizeof(sampleFloat))) expectedValue := *(inputElement.internal) + *(inputElement.external) if *outputFloat != expectedValue { println(" FAILED.") fmt.Printf("Mismatch at position %d, read %f, expected %f\n", i, *outputFloat, expectedValue) return } } println(" PASSED.") }
func (this *command_queue) Finish() error { if errCode := cl.CLFinish(this.command_queue_id); errCode != cl.CL_SUCCESS { return fmt.Errorf("Finish failure with errcode_ret %d: %s", errCode, cl.ERROR_CODES_STRINGS[-errCode]) } return nil }
func main() { /* OpenCL data structures */ var device []cl.CL_device_id var context cl.CL_context var queue cl.CL_command_queue var program *cl.CL_program var kernel cl.CL_kernel var err cl.CL_int /* Data and events */ var num_ints cl.CL_int var num_items [1]cl.CL_size_t var data [NUM_INTS]cl.CL_int var data_buffer cl.CL_mem var prof_event cl.CL_event var total_time cl.CL_ulong var time_start, time_end interface{} /* Initialize data */ for i := 0; i < NUM_INTS; i++ { data[i] = cl.CL_int(i) } /* Set number of data points and work-items */ num_ints = NUM_INTS num_items[0] = NUM_ITEMS /* Create a device and context */ device = utils.Create_device() context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err) if err < 0 { println("Couldn't create a context") return } /* Build the program and create a kernel */ program = utils.Build_program(context, device[:], PROGRAM_FILE, nil) kernel = cl.CLCreateKernel(*program, KERNEL_FUNC, &err) if err < 0 { println("Couldn't create a kernel") return } /* Create a buffer to hold data */ data_buffer = cl.CLCreateBuffer(context, cl.CL_MEM_READ_WRITE|cl.CL_MEM_COPY_HOST_PTR, cl.CL_size_t(unsafe.Sizeof(data[0]))*NUM_INTS, unsafe.Pointer(&data[0]), &err) if err < 0 { println("Couldn't create a buffer") return } /* Create kernel argument */ err = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(data_buffer)), unsafe.Pointer(&data_buffer)) if err < 0 { println("Couldn't set a kernel argument") return } cl.CLSetKernelArg(kernel, 1, cl.CL_size_t(unsafe.Sizeof(num_ints)), unsafe.Pointer(&num_ints)) /* Create a command queue */ queue = cl.CLCreateCommandQueue(context, device[0], cl.CL_QUEUE_PROFILING_ENABLE, &err) if err < 0 { println("Couldn't create a command queue") return } total_time = 0.0 for i := 0; i < NUM_ITERATIONS; i++ { /* Enqueue kernel */ cl.CLEnqueueNDRangeKernel(queue, kernel, 1, nil, num_items[:], nil, 0, nil, &prof_event) if err < 0 { println("Couldn't enqueue the kernel") return } /* Finish processing the queue and get profiling information */ cl.CLFinish(queue) cl.CLGetEventProfilingInfo(prof_event, cl.CL_PROFILING_COMMAND_START, cl.CL_size_t(unsafe.Sizeof(total_time)), &time_start, nil) cl.CLGetEventProfilingInfo(prof_event, cl.CL_PROFILING_COMMAND_END, cl.CL_size_t(unsafe.Sizeof(total_time)), &time_end, nil) total_time += time_end.(cl.CL_ulong) - time_start.(cl.CL_ulong) } fmt.Printf("Average time = %v\n", total_time/NUM_ITERATIONS) /* Deallocate resources */ cl.CLReleaseEvent(prof_event) cl.CLReleaseKernel(kernel) cl.CLReleaseMemObject(data_buffer) cl.CLReleaseCommandQueue(queue) cl.CLReleaseProgram(*program) cl.CLReleaseContext(context) }
func svmbasic(size cl.CL_size_t, context cl.CL_context, queue cl.CL_command_queue, kernel cl.CL_kernel) { // Prepare input data as follows. // Build two arrays: // - an array that consists of the Element structures // (refer to svmbasic.h for the structure definition) // - an array that consists of the float values // // Each structure of the first array has the following pointers: // - 'internal', which points to a 'value' field of another entry // of the same array. // - 'external', which points to a float value from the the // second array. // // Pointers are set randomly. The structures do not reflect any real usage // scenario, but are illustrative for a simple device-side traversal. // // Array of Element Array of floats // structures // // ||====================|| // || ............. || ||============|| // || ............. ||<-----+ || .......... || // ||====================|| | || float || // || float* internal--||------+ || float || // || float* external--||------------------>|| float || // || float value <----||------+ || .......... || // ||====================|| | || .......... || // || ............. || | || float || // || ............. || | || float || // ||====================|| | || float || // ||====================|| | || float || // || float* internal--||------+ || float || // || float* external--||------------------>|| float || // || float value || || float || // ||====================|| || float || // || ............. || || .......... || // || ............. || ||============|| // ||====================|| // // The two arrays are created independently and are used to illustrate // two new OpenCL 2.0 API functions: // - the array of Element structures is passed to the kernel as a // kernel argument with the clSetKernelArgSVMPointer function // - the array of floats is used by the kernel indirectly, and this // dependency should be also specified with the clSetKernelExecInfo // function prior to the kernel execution var err cl.CL_int // To enable host & device code to share pointer to the same address space // the arrays should be allocated as SVM memory. Use the clSVMAlloc function // to allocate SVM memory. // // Optionally, this function allows specifying alignment in bytes as its // last argument. As this basic example doesn't require any _special_ alignment, // the following code illustrates requesting default alignment via passing // zero value. inputElements := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_READ_ONLY, size*cl.CL_size_t(unsafe.Sizeof(sampleElement)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) if nil == inputElements { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } defer cl.CLSVMFree(context, inputElements) inputFloats := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_READ_ONLY, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) if nil == inputFloats { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } defer cl.CLSVMFree(context, inputFloats) // The OpenCL kernel uses the aforementioned input arrays to compute // values for the output array. output := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_WRITE_ONLY, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) defer cl.CLSVMFree(context, output) if nil == output { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } // In the coarse-grained buffer SVM model, only one OpenCL device (or // host) can have ownership for writing to the buffer. Specifically, host // explicitly requests the ownership by mapping/unmapping the SVM buffer. // // So to fill the input SVM buffers on the host, you need to map them to have // access from the host program. // // The following two map calls are required in case of coarse-grained SVM only. err = cl.CLEnqueueSVMMap(queue, cl.CL_TRUE, // blocking map cl.CL_MAP_WRITE, inputElements, size*cl.CL_size_t(unsafe.Sizeof(sampleElement)), 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMMap") err = cl.CLEnqueueSVMMap(queue, cl.CL_TRUE, // blocking map cl.CL_MAP_WRITE, inputFloats, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMMap") // Populate data-structures with initial data. r := rand.New(rand.NewSource(99)) for i := cl.CL_size_t(0); i < size; i++ { inputElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(i)*unsafe.Sizeof(sampleElement))) inputFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(inputFloats) + uintptr(i)*unsafe.Sizeof(sampleFloat))) randElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(r.Intn(int(size)))*unsafe.Sizeof(sampleElement))) randFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(inputFloats) + uintptr(r.Intn(int(size)))*unsafe.Sizeof(sampleFloat))) inputElement.internal = &(randElement.value) inputElement.external = randFloat inputElement.value = cl.CL_float(i) *inputFloat = cl.CL_float(i + size) } // The following two unmap calls are required in case of coarse-grained SVM only err = cl.CLEnqueueSVMUnmap(queue, inputElements, 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMUnmap") err = cl.CLEnqueueSVMUnmap(queue, inputFloats, 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMUnmap") // Pass arguments to the kernel. // According to the OpenCL 2.0 specification, you need to use a special // function to pass a pointer from SVM memory to kernel. err = cl.CLSetKernelArgSVMPointer(kernel, 0, inputElements) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelArgSVMPointer") err = cl.CLSetKernelArgSVMPointer(kernel, 1, output) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelArgSVMPointer") // For buffer based SVM (both coarse- and fine-grain) if one SVM buffer // points to memory allocated in another SVM buffer, such allocations // should be passed to the kernel via clSetKernelExecInfo. err = cl.CLSetKernelExecInfo(kernel, cl.CL_KERNEL_EXEC_INFO_SVM_PTRS, cl.CL_size_t(unsafe.Sizeof(inputFloats)), unsafe.Pointer(&inputFloats)) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelExecInfo") // Run the kernel. println("Running kernel...") var globalWorkSize [1]cl.CL_size_t globalWorkSize[0] = size err = cl.CLEnqueueNDRangeKernel(queue, kernel, 1, nil, globalWorkSize[:], nil, 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueNDRangeKernel") // Map the output SVM buffer to read the results. // Mapping is required for coarse-grained SVM only. err = cl.CLEnqueueSVMMap(queue, cl.CL_TRUE, // blocking map cl.CL_MAP_READ, output, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMMap") println(" DONE.") // Validate output state for correctness. println("Checking correctness of the output buffer...") for i := cl.CL_size_t(0); i < size; i++ { inputElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(i)*unsafe.Sizeof(sampleElement))) outputFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(output) + uintptr(i)*unsafe.Sizeof(sampleFloat))) expectedValue := *(inputElement.internal) + *(inputElement.external) if *outputFloat != expectedValue { println(" FAILED.") fmt.Printf("Mismatch at position %d, read %f, expected %f\n", i, *outputFloat, expectedValue) return } } println(" PASSED.") err = cl.CLEnqueueSVMUnmap(queue, output, 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMUnmap") err = cl.CLFinish(queue) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLFinish") }