/* Create a program from a file and compile it */
func Build_program(context cl.CL_context, device []cl.CL_device_id,
	filename string, options []byte) *cl.CL_program {

	var program cl.CL_program
	var program_buffer [1][]byte
	var program_log interface{}
	var program_size [1]cl.CL_size_t
	var log_size cl.CL_size_t
	var err cl.CL_int

	/* Read the program file and place its content into the buffer array */
	program_handle, err1 := os.Open(filename)
	if err1 != nil {
		fmt.Printf("Couldn't find the program file %s\n", filename)
		return nil
	}
	defer program_handle.Close()

	fi, err2 := program_handle.Stat()
	if err2 != nil {
		fmt.Printf("Couldn't get the program file stat\n")
		return nil
	}
	program_size[0] = cl.CL_size_t(fi.Size())
	program_buffer[0] = make([]byte, program_size[0])
	read_size, err3 := program_handle.Read(program_buffer[0])
	if err3 != nil || cl.CL_size_t(read_size) != program_size[0] {
		fmt.Printf("Read file error or wrong file size\n")
		return nil
	}

	/* Create a program containing all program content */
	program = cl.CLCreateProgramWithSource(context, 1,
		program_buffer[:], program_size[:], &err)
	if err < 0 {
		fmt.Printf("Couldn't create the program\n")
		return nil
	}

	/* Build the program */
	err = cl.CLBuildProgram(program, 1, device[:], options, nil, nil)
	if err < 0 {
		/* Find the size of the build log and print it to standard output */
		cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG,
			0, nil, &log_size)
		cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG,
			log_size, &program_log, nil)
		fmt.Printf("%s\n", program_log)
		return nil
	}

	return &program
}
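// A minimal usage sketch for Build_program, assuming a context and device list
// obtained as in the tests below. The file name "hello.cl" and the kernel name
// "hello" are placeholders, not files shipped with these samples.
func buildExample(context cl.CL_context, device []cl.CL_device_id) *cl.CL_kernel {
	program := Build_program(context, device, "hello.cl", nil)
	if program == nil {
		return nil // Build_program already printed the error or the build log
	}
	var err cl.CL_int
	kernel := cl.CLCreateKernel(*program, []byte("hello"), &err)
	if err < 0 {
		fmt.Printf("Couldn't create the kernel\n")
		return nil
	}
	return &kernel
}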
func TestQueue(t *testing.T) {
	/* Host/device data structures */
	var platform [1]cl.CL_platform_id
	var device [1]cl.CL_device_id
	var context cl.CL_context
	var queue cl.CL_command_queue
	var err cl.CL_int

	/* Program/kernel data structures */
	var program cl.CL_program
	var program_buffer [1][]byte
	var program_log interface{}
	var program_size [1]cl.CL_size_t
	var log_size cl.CL_size_t
	var kernel cl.CL_kernel

	/* Access the first installed platform */
	err = cl.CLGetPlatformIDs(1, platform[:], nil)
	if err < 0 {
		t.Errorf("Couldn't find any platforms")
	}

	/* Access the first GPU/CPU */
	err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_GPU, 1, device[:], nil)
	if err == cl.CL_DEVICE_NOT_FOUND {
		err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_CPU, 1, device[:], nil)
	}
	if err < 0 {
		t.Errorf("Couldn't find any devices")
	}

	/* Create a context */
	context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err)
	if err < 0 {
		t.Errorf("Couldn't create a context")
	}

	/* Read the program file and place its content into the buffer array */
	program_handle, err1 := os.Open("blank.cl")
	if err1 != nil {
		t.Errorf("Couldn't find the program file")
	}
	defer program_handle.Close()

	fi, err2 := program_handle.Stat()
	if err2 != nil {
		t.Errorf("Couldn't find the program stat")
	}
	program_size[0] = cl.CL_size_t(fi.Size())
	program_buffer[0] = make([]byte, program_size[0])
	read_size, err3 := program_handle.Read(program_buffer[0])
	if err3 != nil || cl.CL_size_t(read_size) != program_size[0] {
		t.Errorf("read file error or file size wrong")
	}

	/* Create a program from the file content */
	program = cl.CLCreateProgramWithSource(context, 1,
		program_buffer[:], program_size[:], &err)
	if err < 0 {
		t.Errorf("Couldn't create the program")
	}

	/* Build the program */
	err = cl.CLBuildProgram(program, 1, device[:], nil, nil, nil)
	if err < 0 {
		/* Find the size of the build log and print it to standard output */
		cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG,
			0, nil, &log_size)
		cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG,
			log_size, &program_log, nil)
		t.Errorf("%s\n", program_log)
	}

	/* Create the kernel */
	kernel = cl.CLCreateKernel(program, []byte("blank"), &err)
	if err < 0 {
		t.Errorf("Couldn't create the kernel")
	}

	/* Create the command queue */
	queue = cl.CLCreateCommandQueue(context, device[0], 0, &err)
	if err < 0 {
		t.Errorf("Couldn't create the command queue")
	}

	/* Enqueue the kernel execution command */
	err = cl.CLEnqueueTask(queue, kernel, 0, nil, nil)
	if err < 0 {
		t.Errorf("Couldn't enqueue the kernel execution command")
	} else {
		t.Logf("Successfully queued kernel.\n")
	}

	/* Deallocate resources */
	cl.CLReleaseCommandQueue(queue)
	cl.CLReleaseKernel(kernel)
	cl.CLReleaseProgram(program)
	cl.CLReleaseContext(context)
}
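// CLEnqueueTask runs the kernel as a single work-item. On implementations where
// clEnqueueTask is deprecated (OpenCL 2.0), the same effect can be obtained with
// an NDRange launch of global and local size 1. A sketch, assuming a valid queue
// and kernel such as the ones created in TestQueue above:
func enqueueSingleTask(queue cl.CL_command_queue, kernel cl.CL_kernel) cl.CL_int {
	// clEnqueueTask is defined as an NDRange launch with work_dim = 1 and a
	// global/local work size of 1.
	globalSize := []cl.CL_size_t{1}
	localSize := []cl.CL_size_t{1}
	return cl.CLEnqueueNDRangeKernel(queue, kernel, 1, nil,
		globalSize, localSize, 0, nil, nil)
}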
func main() {
	// Use this to check the output of each API call
	var status cl.CL_int

	//-----------------------------------------------------
	// STEP 1: Discover and initialize the platforms
	//-----------------------------------------------------
	var numPlatforms cl.CL_uint
	var platforms []cl.CL_platform_id

	// Use clGetPlatformIDs() to retrieve the number of platforms
	status = cl.CLGetPlatformIDs(0, nil, &numPlatforms)

	// Allocate enough space for each platform
	platforms = make([]cl.CL_platform_id, numPlatforms)

	// Fill in platforms with clGetPlatformIDs()
	status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetPlatformIDs")

	//-----------------------------------------------------
	// STEP 2: Discover and initialize the GPU devices
	//-----------------------------------------------------
	var numDevices cl.CL_uint
	var devices []cl.CL_device_id

	// Use clGetDeviceIDs() to retrieve the number of devices present
	status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU,
		0, nil, &numDevices)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs")

	// Allocate enough space for each device
	devices = make([]cl.CL_device_id, numDevices)

	// Fill in devices with clGetDeviceIDs()
	status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU,
		numDevices, devices, nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs")

	//-----------------------------------------------------
	// STEP 3: Create a context
	//-----------------------------------------------------
	var context cl.CL_context

	// Create a context using clCreateContext() and associate it with the devices
	context = cl.CLCreateContext(nil, numDevices, devices, nil, nil, &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateContext")
	defer cl.CLReleaseContext(context)

	//-----------------------------------------------------
	// STEP 4: Create the command queues
	//-----------------------------------------------------
	var commandQueue [MAX_COMMAND_QUEUE]cl.CL_command_queue

	// Create the command queues using clCreateCommandQueueWithProperties(),
	// and associate them with the device you want to execute on
	for i := 0; i < MAX_COMMAND_QUEUE; i++ {
		commandQueue[i] = cl.CLCreateCommandQueueWithProperties(context,
			devices[0], nil, &status)
		utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateCommandQueueWithProperties")
		defer cl.CLReleaseCommandQueue(commandQueue[i])
	}

	//-----------------------------------------------------
	// STEP 5: Create device buffers
	//-----------------------------------------------------
	producerGroupSize := cl.CL_size_t(PRODUCER_GROUP_SIZE)
	producerGlobalSize := cl.CL_size_t(PRODUCER_GLOBAL_SIZE)
	consumerGroupSize := cl.CL_size_t(CONSUMER_GROUP_SIZE)
	consumerGlobalSize := cl.CL_size_t(CONSUMER_GLOBAL_SIZE)

	var samplePipePkt [2]cl.CL_float
	szPipe := cl.CL_uint(PIPE_SIZE)
	szPipePkt := cl.CL_uint(unsafe.Sizeof(samplePipePkt))

	// Round the pipe size up to a multiple of PRNG_CHANNELS
	if szPipe%PRNG_CHANNELS != 0 {
		szPipe = (szPipe/PRNG_CHANNELS)*PRNG_CHANNELS + PRNG_CHANNELS
	}
	consumerGlobalSize = cl.CL_size_t(szPipe)
	pipePktPerThread := cl.CL_int(szPipe) / PRNG_CHANNELS
	seed := cl.CL_int(SEED)
	rngType := cl.CL_int(RV_GAUSSIAN)

	var histMin cl.CL_float
	var histMax cl.CL_float
	if rngType == cl.CL_int(RV_UNIFORM) {
		histMin = 0.0
		histMax = 1.0
	} else {
		histMin = -10.0
		histMax = 10.0
	}

	localDevHist := make([]cl.CL_int, MAX_HIST_BINS)
	cpuHist := make([]cl.CL_int, MAX_HIST_BINS)

	// Create and initialize the memory objects
	rngPipe := cl.CLCreatePipe(context, cl.CL_MEM_READ_WRITE,
		szPipePkt, szPipe, nil, &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clCreatePipe")

	devHist := cl.CLCreateBuffer(context,
		cl.CL_MEM_READ_WRITE|cl.CL_MEM_COPY_HOST_PTR,
		MAX_HIST_BINS*cl.CL_size_t(unsafe.Sizeof(localDevHist[0])),
		unsafe.Pointer(&localDevHist[0]), &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clCreateBuffer")

	//-----------------------------------------------------
	// STEP 6: Create and compile the program
	//-----------------------------------------------------
	programSource, programeSize := utils.Load_programsource("pipe.cl")

	// Create a program using clCreateProgramWithSource()
	program := cl.CLCreateProgramWithSource(context, 1,
		programSource[:], programeSize[:], &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateProgramWithSource")
	defer cl.CLReleaseProgram(program)

	// Build (compile) the program for the devices with clBuildProgram()
	options := "-cl-std=CL2.0"
	status = cl.CLBuildProgram(program, numDevices, devices, []byte(options), nil, nil)
	if status != cl.CL_SUCCESS {
		var program_log interface{}
		var log_size cl.CL_size_t

		/* Find the size of the build log and print it to standard output */
		cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG,
			0, nil, &log_size)
		cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG,
			log_size, &program_log, nil)
		fmt.Printf("%s\n", program_log)
		return
	}

	//-----------------------------------------------------
	// STEP 7: Create the kernels
	//-----------------------------------------------------
	// Use clCreateKernel() to create the producer and consumer kernels
	produceKernel := cl.CLCreateKernel(program, []byte("pipe_producer"), &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel")
	defer cl.CLReleaseKernel(produceKernel)

	consumeKernel := cl.CLCreateKernel(program, []byte("pipe_consumer"), &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel")
	defer cl.CLReleaseKernel(consumeKernel)

	//-----------------------------------------------------
	// STEP 8: Set the kernel arguments (producer)
	//-----------------------------------------------------
	// Associate the pipe and the RNG parameters with the producer kernel
	// using clSetKernelArg()
	status = cl.CLSetKernelArg(produceKernel, 0,
		cl.CL_size_t(unsafe.Sizeof(rngPipe)), unsafe.Pointer(&rngPipe))
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(rngPipe)")

	status = cl.CLSetKernelArg(produceKernel, 1,
		cl.CL_size_t(unsafe.Sizeof(pipePktPerThread)), unsafe.Pointer(&pipePktPerThread))
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(pipePktPerThread)")

	status = cl.CLSetKernelArg(produceKernel, 2,
		cl.CL_size_t(unsafe.Sizeof(seed)), unsafe.Pointer(&seed))
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(seed)")

	status = cl.CLSetKernelArg(produceKernel, 3,
		cl.CL_size_t(unsafe.Sizeof(rngType)), unsafe.Pointer(&rngType))
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(rngType)")

	//-----------------------------------------------------
	// STEP 9: Configure the work-item structure (producer)
	//-----------------------------------------------------
	// Define an index space (global work size) of work-items for execution.
	// A work-group size (local work size) is not required, but can be used.
	var globalThreads = []cl.CL_size_t{producerGlobalSize}
	var localThreads = []cl.CL_size_t{producerGroupSize}

	//-----------------------------------------------------
	// STEP 10: Enqueue the producer kernel for execution
	//-----------------------------------------------------
	// Execute the kernel with clEnqueueNDRangeKernel().
	// 'globalThreads' is the 1D dimension of the work-items.
	var produceEvt [1]cl.CL_event
	status = cl.CLEnqueueNDRangeKernel(commandQueue[0], produceKernel, 1, nil,
		globalThreads, localThreads, 0, nil, &produceEvt[0])
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueNDRangeKernel")

	/* Launch the consumer kernel only after the producer has finished.
	   This avoids concurrent kernel execution, since the memory consistency
	   of a pipe is guaranteed only across synchronization points. */
	status = cl.CLWaitForEvents(1, produceEvt[:])
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clWaitForEvents(produceEvt)")

	//-----------------------------------------------------
	// STEP 8 (consumer): Set the kernel arguments
	//-----------------------------------------------------
	// Associate the pipe and the histogram parameters with the consumer kernel
	// using clSetKernelArg()
	status = cl.CLSetKernelArg(consumeKernel, 0,
		cl.CL_size_t(unsafe.Sizeof(rngPipe)), unsafe.Pointer(&rngPipe))
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(rngPipe)")

	status = cl.CLSetKernelArg(consumeKernel, 1,
		cl.CL_size_t(unsafe.Sizeof(devHist)), unsafe.Pointer(&devHist))
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(devHist)")

	status = cl.CLSetKernelArg(consumeKernel, 2,
		cl.CL_size_t(unsafe.Sizeof(histMin)), unsafe.Pointer(&histMin))
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(histMin)")

	status = cl.CLSetKernelArg(consumeKernel, 3,
		cl.CL_size_t(unsafe.Sizeof(histMax)), unsafe.Pointer(&histMax))
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(histMax)")

	//-----------------------------------------------------
	// STEP 9 (consumer): Configure the work-item structure
	//-----------------------------------------------------
	globalThreads[0] = consumerGlobalSize
	localThreads[0] = consumerGroupSize

	//-----------------------------------------------------
	// STEP 10 (consumer): Enqueue the consumer kernel for execution
	//-----------------------------------------------------
	// Execute the kernel with clEnqueueNDRangeKernel() on the second queue.
	var consumeEvt [1]cl.CL_event
	status = cl.CLEnqueueNDRangeKernel(commandQueue[1], consumeKernel, 1, nil,
		globalThreads, localThreads, 0, nil, &consumeEvt[0])
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueNDRangeKernel")

	status = cl.CLFlush(commandQueue[0])
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFlush(0)")
	status = cl.CLFlush(commandQueue[1])
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFlush(1)")

	// Wait for the kernels to finish
	status = cl.CLFinish(commandQueue[0])
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFinish(0)")
	status = cl.CLFinish(commandQueue[1])
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFinish(1)")

	//-----------------------------------------------------
	// STEP 11: Read the output buffer back to the host
	//-----------------------------------------------------
	// Use clEnqueueReadBuffer() to copy the device histogram (devHist)
	// back into the host buffer (localDevHist)
	var readEvt cl.CL_event
	status = cl.CLEnqueueReadBuffer(commandQueue[1], devHist, cl.CL_TRUE, 0,
		MAX_HIST_BINS*cl.CL_size_t(unsafe.Sizeof(localDevHist[0])),
		unsafe.Pointer(&localDevHist[0]), 0, nil, &readEvt)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueReadBuffer")

	//-----------------------------------------------------
	// STEP 12: Verify the results
	//-----------------------------------------------------
	// Find the tolerance limit
	fTol := float32(CONSUMER_GLOBAL_SIZE) * float32(COMP_TOL) / float32(100.0)
	iTol := int(fTol)
	if iTol == 0 {
		iTol = 1
	}

	// CPU-side histogram computation
	CPUReference(seed, pipePktPerThread, rngType, cpuHist, histMax, histMin)

	// Compare the device and CPU histograms bin by bin
	for bin := 0; bin < MAX_HIST_BINS; bin++ {
		diff := int(localDevHist[bin] - cpuHist[bin])
		if diff < 0 {
			diff = -diff
		}
		if diff > iTol {
			println("Failed!")
			return
		}
	}
	println("Passed!")
}
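// The samples above rely on a small utils package (CHECK_STATUS,
// Load_programsource) that is not reproduced in this section. A minimal sketch
// of what CHECK_STATUS could look like, assuming it only compares the returned
// status against the expected value and aborts on mismatch; the real helper in
// the utils package may differ.
func CHECK_STATUS(status, expected cl.CL_int, name string) {
	if status != expected {
		fmt.Printf("%s failed: got status %d, expected %d\n", name, status, expected)
		os.Exit(1)
	}
}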
func main() {
	// Use this to check the output of each API call
	var status cl.CL_int

	//-----------------------------------------------------
	// STEP 1: Discover and initialize the platforms
	//-----------------------------------------------------
	var numPlatforms cl.CL_uint
	var platforms []cl.CL_platform_id

	// Use clGetPlatformIDs() to retrieve the number of platforms
	status = cl.CLGetPlatformIDs(0, nil, &numPlatforms)

	// Allocate enough space for each platform
	platforms = make([]cl.CL_platform_id, numPlatforms)

	// Fill in platforms with clGetPlatformIDs()
	status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetPlatformIDs")

	//-----------------------------------------------------
	// STEP 2: Discover and initialize the GPU devices
	//-----------------------------------------------------
	var numDevices cl.CL_uint
	var devices []cl.CL_device_id

	// Use clGetDeviceIDs() to retrieve the number of devices present
	status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU,
		0, nil, &numDevices)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs")

	// Allocate enough space for each device
	devices = make([]cl.CL_device_id, numDevices)

	// Fill in devices with clGetDeviceIDs()
	status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU,
		numDevices, devices, nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs")

	//-----------------------------------------------------
	// STEP 3: Create a context
	//-----------------------------------------------------
	var context cl.CL_context

	// Create a context using clCreateContext() and associate it with the devices
	context = cl.CLCreateContext(nil, numDevices, devices, nil, nil, &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateContext")
	defer cl.CLReleaseContext(context)

	//-----------------------------------------------------
	// STEP 4: Create a command queue
	//-----------------------------------------------------
	var cmdQueue cl.CL_command_queue

	// Create a command queue using clCreateCommandQueueWithProperties(),
	// and associate it with the device you want to execute on
	cmdQueue = cl.CLCreateCommandQueueWithProperties(context, devices[0], nil, &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateCommandQueueWithProperties")
	defer cl.CLReleaseCommandQueue(cmdQueue)

	//-----------------------------------------------------
	// STEP 5: Create device buffers (SVM memory)
	//-----------------------------------------------------
	/* SVM buffer for the binary tree */
	svmTreeBuf := cl.CLSVMAlloc(context, cl.CL_MEM_READ_WRITE,
		cl.CL_size_t(NUMBER_OF_NODES*unsafe.Sizeof(sampleNode)), 0)
	if nil == svmTreeBuf {
		println("clSVMAlloc(svmTreeBuf) failed.")
		return
	}
	defer cl.CLSVMFree(context, svmTreeBuf)

	/* SVM buffer for the search keys */
	svmSearchBuf := cl.CLSVMAlloc(context, cl.CL_MEM_READ_WRITE,
		cl.CL_size_t(NUMBER_OF_SEARCH_KEY*unsafe.Sizeof(sampleKey)), 0)
	if nil == svmSearchBuf {
		println("clSVMAlloc(svmSearchBuf) failed.")
		return
	}
	defer cl.CLSVMFree(context, svmSearchBuf)

	// Create the binary tree and get its root node
	svmRoot := cpuCreateBinaryTree(cmdQueue, svmTreeBuf)

	// Initialize the search keys
	cpuInitSearchKeys(cmdQueue, svmSearchBuf)

	// Print the search parameters
	fmt.Printf("-------------------------------------------------------------------------\n")
	fmt.Printf("Searching %d keys in a BST having %d Nodes...\n",
		NUMBER_OF_SEARCH_KEY, NUMBER_OF_NODES)
	fmt.Printf("-------------------------------------------------------------------------\n")

	//-----------------------------------------------------
	// STEP 6: Create and compile the program
	//-----------------------------------------------------
	programSource, programeSize := utils.Load_programsource("bst.cl")

	// Create a program using clCreateProgramWithSource()
	program := cl.CLCreateProgramWithSource(context, 1,
		programSource[:], programeSize[:], &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateProgramWithSource")
	defer cl.CLReleaseProgram(program)

	// Build (compile) the program for the devices with clBuildProgram()
	options := "-cl-std=CL2.0"
	status = cl.CLBuildProgram(program, numDevices, devices, []byte(options), nil, nil)
	if status != cl.CL_SUCCESS {
		var program_log interface{}
		var log_size cl.CL_size_t

		/* Find the size of the build log and print it to standard output */
		cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG,
			0, nil, &log_size)
		cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG,
			log_size, &program_log, nil)
		fmt.Printf("%s\n", program_log)
		return
	}

	//-----------------------------------------------------
	// STEP 7: Create the kernel
	//-----------------------------------------------------
	var kernel cl.CL_kernel

	// Use clCreateKernel() to create a kernel
	kernel = cl.CLCreateKernel(program, []byte("bst_kernel"), &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel")
	defer cl.CLReleaseKernel(kernel)

	//-----------------------------------------------------
	// STEP 8: Set the kernel arguments
	//-----------------------------------------------------
	// Associate the SVM buffers with the kernel using clSetKernelArgSVMPointer()
	status = cl.CLSetKernelArgSVMPointer(kernel, 0, svmTreeBuf)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArgSVMPointer(svmTreeBuf)")

	status = cl.CLSetKernelArgSVMPointer(kernel, 1, svmSearchBuf)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArgSVMPointer(svmSearchBuf)")

	//-----------------------------------------------------
	// STEP 9: Configure the work-item structure
	//-----------------------------------------------------
	// Define an index space (global work size) of work-items for execution.
	// Use the kernel's maximum work-group size as the local work size.
	var localWorkSize [1]cl.CL_size_t
	var kernelWorkGroupSize interface{}
	status = cl.CLGetKernelWorkGroupInfo(kernel, devices[0], cl.CL_KERNEL_WORK_GROUP_SIZE,
		cl.CL_size_t(unsafe.Sizeof(localWorkSize[0])), &kernelWorkGroupSize, nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetKernelWorkGroupInfo")
	localWorkSize[0] = kernelWorkGroupSize.(cl.CL_size_t)

	var globalWorkSize [1]cl.CL_size_t
	globalWorkSize[0] = NUMBER_OF_SEARCH_KEY

	//-----------------------------------------------------
	// STEP 10: Enqueue the kernel for execution
	//-----------------------------------------------------
	// Execute the kernel with clEnqueueNDRangeKernel().
	// 'globalWorkSize' is the 1D dimension of the work-items.
	status = cl.CLEnqueueNDRangeKernel(cmdQueue, kernel, 1, nil,
		globalWorkSize[:], localWorkSize[:], 0, nil, nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLEnqueueNDRangeKernel")

	status = cl.CLFlush(cmdQueue)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFlush")
	status = cl.CLFinish(cmdQueue)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFinish")

	//-----------------------------------------------------
	// STEP 11: Read the output buffer back to the host
	//-----------------------------------------------------
	// Not needed here: the results live in SVM memory that the host can
	// access directly, so no clEnqueueReadBuffer call is required.

	//-----------------------------------------------------
	// STEP 12: Verify the results
	//-----------------------------------------------------
	// Reference implementation on the CPU
	svmBinaryTreeCPUReference(cmdQueue, svmRoot, svmTreeBuf, svmSearchBuf)

	// Compare the results and see if they match
	pass := svmCompareResults(cmdQueue, svmSearchBuf)
	if pass {
		println("Passed!")
	} else {
		println("Failed!")
	}
}
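// The SVM samples also assume a utils.Load_programsource helper. Judging from
// its use above, it returns the [1][]byte / [1]cl.CL_size_t pair that
// CLCreateProgramWithSource expects. A sketch under that assumption (the real
// utils implementation may differ; needs "io/ioutil" in the imports):
func Load_programsource(filename string) ([1][]byte, [1]cl.CL_size_t) {
	var source [1][]byte
	var size [1]cl.CL_size_t
	data, err := ioutil.ReadFile(filename)
	if err != nil {
		fmt.Printf("Couldn't read %s: %v\n", filename, err)
		return source, size
	}
	source[0] = data
	size[0] = cl.CL_size_t(len(data))
	return source, size
}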
func main() {
	// Use this to check the output of each API call
	var status cl.CL_int

	//-----------------------------------------------------
	// STEP 1: Discover and initialize the platforms
	//-----------------------------------------------------
	var numPlatforms cl.CL_uint

	// Use clGetPlatformIDs() to retrieve the number of platforms
	status = cl.CLGetPlatformIDs(0, nil, &numPlatforms)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetPlatformIDs")

	// Allocate enough space for each platform
	platforms := make([]cl.CL_platform_id, numPlatforms)

	// Fill in platforms with clGetPlatformIDs()
	status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetPlatformIDs")

	//-----------------------------------------------------
	// STEP 2: Discover and initialize the GPU devices
	//-----------------------------------------------------
	var numDevices cl.CL_uint

	// Use clGetDeviceIDs() to retrieve the number of devices present
	status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU,
		0, nil, &numDevices)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs")

	// Allocate enough space for each device
	devices := make([]cl.CL_device_id, numDevices)

	// Fill in devices with clGetDeviceIDs()
	status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU,
		numDevices, devices, nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs")

	// Coarse-grained buffer SVM is mandatory on OpenCL 2.0 devices, but
	// fine-grained buffer SVM is optional, so query the SVM capabilities
	// and make sure the device reports CL_DEVICE_SVM_FINE_GRAIN_BUFFER.
	var caps cl.CL_device_svm_capabilities
	var caps_value interface{}
	status = cl.CLGetDeviceInfo(devices[0],
		cl.CL_DEVICE_SVM_CAPABILITIES,
		cl.CL_size_t(unsafe.Sizeof(caps)),
		&caps_value,
		nil)
	caps = caps_value.(cl.CL_device_svm_capabilities)
	if !(status == cl.CL_SUCCESS && (caps&cl.CL_DEVICE_SVM_FINE_GRAIN_BUFFER) != 0) {
		fmt.Printf("Cannot detect fine-grained buffer SVM capabilities on the device. "+
			"The device seemingly doesn't support fine-grained buffer SVM (caps=%x).\n", caps)
		return
	}

	//-----------------------------------------------------
	// STEP 3: Create a context
	//-----------------------------------------------------
	// Create a context using clCreateContext() and associate it with the devices
	context := cl.CLCreateContext(nil, numDevices, devices, nil, nil, &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateContext")
	defer cl.CLReleaseContext(context)

	//-----------------------------------------------------
	// STEP 4: Create a command queue
	//-----------------------------------------------------
	// Create a command queue using clCreateCommandQueueWithProperties(),
	// and associate it with the device you want to execute on
	queue := cl.CLCreateCommandQueueWithProperties(context, devices[0], nil, &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateCommandQueueWithProperties")
	defer cl.CLReleaseCommandQueue(queue)

	//-----------------------------------------------------
	// STEP 5: Create and compile the program
	//-----------------------------------------------------
	programSource, programeSize := utils.Load_programsource("svmfg.cl")

	// Create a program using clCreateProgramWithSource()
	program := cl.CLCreateProgramWithSource(context, 1,
		programSource[:], programeSize[:], &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateProgramWithSource")
	defer cl.CLReleaseProgram(program)

	// Build (compile) the program for the devices with clBuildProgram()
	options := "-cl-std=CL2.0"
	status = cl.CLBuildProgram(program, numDevices, devices, []byte(options), nil, nil)
	if status != cl.CL_SUCCESS {
		var log interface{}
		var log_size cl.CL_size_t

		/* Find the size of the build log and print it to standard output */
		cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG,
			0, nil, &log_size)
		cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG,
			log_size, &log, nil)
		fmt.Printf("%s\n", log)
		return
	}

	//-----------------------------------------------------
	// STEP 6: Create the kernel
	//-----------------------------------------------------
	// Use clCreateKernel() to create a kernel
	kernel := cl.CLCreateKernel(program, []byte("svmbasic"), &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel")
	defer cl.CLReleaseKernel(kernel)

	// Then call the main sample routine: resource allocation, OpenCL kernel
	// execution, and so on.
	svmbasic(1024*1024, context, queue, kernel)

	// All resource deallocation happens in the deferred calls above.
}
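// The svmbasic routine itself is not shown in this section. The point of
// fine-grained buffer SVM is that the host can read and write an allocation
// returned by CLSVMAlloc directly, with no map/unmap calls. A hedged sketch of
// that idea, assuming CLSVMAlloc returns an unsafe.Pointer (as its use above
// suggests); the float32 layout and element count are illustrative only.
func fillSVM(svmPtr unsafe.Pointer, n int) {
	// View the SVM block as a []float32 on the host side.
	host := (*[1 << 26]float32)(svmPtr)[:n:n]
	for i := range host {
		host[i] = float32(i)
	}
	// With fine-grained buffer SVM the kernel sees these values directly,
	// subject to the usual synchronization points (e.g. clFinish).
}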
func TestMatvec(t *testing.T) {
	/* Host/device data structures */
	var platform [1]cl.CL_platform_id
	var device [1]cl.CL_device_id
	var context cl.CL_context
	var queue cl.CL_command_queue
	var i, err cl.CL_int

	/* Program/kernel data structures */
	var program cl.CL_program
	var program_buffer [1][]byte
	var program_log interface{}
	var program_size [1]cl.CL_size_t
	var log_size cl.CL_size_t
	var kernel cl.CL_kernel

	/* Data and buffers */
	var mat [16]float32
	var vec, result [4]float32
	var correct = [4]float32{0.0, 0.0, 0.0, 0.0}
	var mat_buff, vec_buff, res_buff cl.CL_mem

	/* Initialize data to be processed by the kernel */
	for i = 0; i < 16; i++ {
		mat[i] = float32(i) * 2.0
	}
	for i = 0; i < 4; i++ {
		vec[i] = float32(i) * 3.0
		correct[0] += mat[i] * vec[i]
		correct[1] += mat[i+4] * vec[i]
		correct[2] += mat[i+8] * vec[i]
		correct[3] += mat[i+12] * vec[i]
	}

	/* Identify a platform */
	err = cl.CLGetPlatformIDs(1, platform[:], nil)
	if err < 0 {
		t.Errorf("Couldn't find any platforms")
	}

	/* Access a device */
	err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_GPU, 1, device[:], nil)
	if err < 0 {
		err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_CPU, 1, device[:], nil)
		if err < 0 {
			t.Errorf("Couldn't find any devices")
		}
	}

	/* Create the context */
	context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err)
	if err < 0 {
		t.Errorf("Couldn't create a context")
	}

	/* Read the program file and place its content into the buffer */
	program_handle, err1 := os.Open("matvec.cl")
	if err1 != nil {
		t.Errorf("Couldn't find the program file")
	}
	defer program_handle.Close()

	fi, err2 := program_handle.Stat()
	if err2 != nil {
		t.Errorf("Couldn't find the program stat")
	}
	program_size[0] = cl.CL_size_t(fi.Size())
	program_buffer[0] = make([]byte, program_size[0])
	read_size, err3 := program_handle.Read(program_buffer[0])
	if err3 != nil || cl.CL_size_t(read_size) != program_size[0] {
		t.Errorf("read file error or file size wrong")
	}

	/* Create a program containing all program content */
	program = cl.CLCreateProgramWithSource(context, 1,
		program_buffer[:], program_size[:], &err)
	if err < 0 {
		t.Errorf("Couldn't create the program")
	}

	/* Build the program */
	err = cl.CLBuildProgram(program, 1, device[:], nil, nil, nil)
	if err < 0 {
		/* Find the size of the build log and print it to standard output */
		cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG,
			0, nil, &log_size)
		cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG,
			log_size, &program_log, nil)
		t.Errorf("%s\n", program_log)
	}

	/* Create a kernel for the matvec_mult function */
	kernel = cl.CLCreateKernel(program, []byte("matvec_mult"), &err)
	if err < 0 {
		t.Errorf("Couldn't create the kernel")
		return
	}

	/* Create CL buffers to hold input and output data */
	mat_buff = cl.CLCreateBuffer(context, cl.CL_MEM_READ_ONLY|cl.CL_MEM_COPY_HOST_PTR,
		cl.CL_size_t(unsafe.Sizeof(mat)), unsafe.Pointer(&mat[0]), &err)
	if err < 0 {
		t.Errorf("Couldn't create a buffer object")
		return
	}
	vec_buff = cl.CLCreateBuffer(context, cl.CL_MEM_READ_ONLY|cl.CL_MEM_COPY_HOST_PTR,
		cl.CL_size_t(unsafe.Sizeof(vec)), unsafe.Pointer(&vec[0]), nil)
	res_buff = cl.CLCreateBuffer(context, cl.CL_MEM_WRITE_ONLY,
		cl.CL_size_t(unsafe.Sizeof(result)), nil, nil)

	/* Create kernel arguments from the CL buffers */
	err = cl.CLSetKernelArg(kernel, 0,
		cl.CL_size_t(unsafe.Sizeof(mat_buff)), unsafe.Pointer(&mat_buff))
	if err < 0 {
		t.Errorf("Couldn't set the kernel argument")
		return
	}
	cl.CLSetKernelArg(kernel, 1,
		cl.CL_size_t(unsafe.Sizeof(vec_buff)), unsafe.Pointer(&vec_buff))
	cl.CLSetKernelArg(kernel, 2,
		cl.CL_size_t(unsafe.Sizeof(res_buff)), unsafe.Pointer(&res_buff))

	/* Create a CL command queue for the device */
	queue = cl.CLCreateCommandQueue(context, device[0], 0, &err)
	if err < 0 {
		t.Errorf("Couldn't create the command queue, errcode=%d\n", err)
		return
	}

	/* Enqueue the kernel execution command */
	var work_units_per_kernel = [1]cl.CL_size_t{4} /* 4 work-items in total */
	err = cl.CLEnqueueNDRangeKernel(queue, kernel, 1, nil,
		work_units_per_kernel[:], nil, 0, nil, nil)
	if err < 0 {
		t.Errorf("Couldn't enqueue the kernel execution command, errcode=%d\n", err)
		return
	}

	/* Read the result */
	err = cl.CLEnqueueReadBuffer(queue, res_buff, cl.CL_TRUE, 0,
		cl.CL_size_t(unsafe.Sizeof(result)), unsafe.Pointer(&result[0]), 0, nil, nil)
	if err < 0 {
		t.Errorf("Couldn't enqueue the read buffer command")
		return
	}

	/* Test the result */
	if (result[0] == correct[0]) && (result[1] == correct[1]) &&
		(result[2] == correct[2]) && (result[3] == correct[3]) {
		t.Logf("Matrix-vector multiplication successful.")
	} else {
		t.Errorf("Matrix-vector multiplication unsuccessful.")
	}

	/* Deallocate resources */
	cl.CLReleaseMemObject(mat_buff)
	cl.CLReleaseMemObject(vec_buff)
	cl.CLReleaseMemObject(res_buff)
	cl.CLReleaseKernel(kernel)
	cl.CLReleaseCommandQueue(queue)
	cl.CLReleaseProgram(program)
	cl.CLReleaseContext(context)
}
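// matvec.cl itself is not reproduced here. A kernel of roughly this shape would
// satisfy the test above (one float4 dot product per work-item over a 4x4
// matrix); treat it as an assumption about the file, not its actual content.
// Embedding the source as a Go string is an alternative to the os.Open/Read step:
const matvecSource = `
__kernel void matvec_mult(__global float4* matrix,
                          __global float4* vector,
                          __global float*  result) {
   int i = get_global_id(0);
   result[i] = dot(matrix[i], vector[0]);
}
`

// To use the embedded source, fill the same [1][]byte / [1]cl.CL_size_t pair
// the test already builds:
//   program_buffer[0] = []byte(matvecSource)
//   program_size[0] = cl.CL_size_t(len(matvecSource))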
func TestProgram(t *testing.T) {
	/* Host/device data structures */
	var platform [1]cl.CL_platform_id
	var device [1]cl.CL_device_id
	var context cl.CL_context
	var i, err cl.CL_int

	/* Program data structures */
	var program cl.CL_program
	var program_buffer [NUM_FILES][]byte
	var program_log interface{}
	var file_name = []string{"bad.cl", "good.cl"}
	options := "-cl-finite-math-only -cl-no-signed-zeros"
	var program_size [NUM_FILES]cl.CL_size_t
	var log_size cl.CL_size_t

	/* Access the first installed platform */
	err = cl.CLGetPlatformIDs(1, platform[:], nil)
	if err < 0 {
		t.Errorf("Couldn't find any platforms")
	}

	/* Access the first GPU/CPU */
	err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_GPU, 1, device[:], nil)
	if err == cl.CL_DEVICE_NOT_FOUND {
		err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_CPU, 1, device[:], nil)
	}
	if err < 0 {
		t.Errorf("Couldn't find any devices")
	}

	/* Create a context */
	context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err)
	if err < 0 {
		t.Errorf("Couldn't create a context")
	}

	/* Read each program file and place its content into the buffer array */
	for i = 0; i < NUM_FILES; i++ {
		program_handle, err := os.Open(file_name[i])
		if err != nil {
			t.Errorf("Couldn't find the program file")
		}
		defer program_handle.Close()

		fi, err2 := program_handle.Stat()
		if err2 != nil {
			t.Errorf("Couldn't find the program stat")
		}
		program_size[i] = cl.CL_size_t(fi.Size())
		program_buffer[i] = make([]byte, program_size[i])
		read_size, err3 := program_handle.Read(program_buffer[i])
		if err3 != nil || cl.CL_size_t(read_size) != program_size[i] {
			t.Errorf("read file error or file size wrong")
		}
	}

	/* Create a program containing all program content */
	program = cl.CLCreateProgramWithSource(context, NUM_FILES,
		program_buffer[:], program_size[:], &err)
	if err < 0 {
		t.Errorf("Couldn't create the program")
	}

	/* Build the program */
	err = cl.CLBuildProgram(program, 1, device[:], []byte(options), nil, nil)
	if err < 0 {
		/* Find the size of the build log and print it to standard output */
		cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG,
			0, nil, &log_size)
		cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG,
			log_size, &program_log, nil)
		t.Errorf("%s\n", program_log)
	}

	/* Deallocate resources */
	cl.CLReleaseProgram(program)
	cl.CLReleaseContext(context)
}
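// Optional follow-up, assuming the binding exposes CL_PROGRAM_BUILD_STATUS the
// same way it exposes CL_PROGRAM_BUILD_LOG (this is an assumption about the cl
// package, not something the test above relies on): query the per-device build
// status before, or instead of, fetching the whole log.
func printBuildStatus(program cl.CL_program, device cl.CL_device_id) {
	var size cl.CL_size_t
	var value interface{}
	cl.CLGetProgramBuildInfo(program, device, cl.CL_PROGRAM_BUILD_STATUS, 0, nil, &size)
	cl.CLGetProgramBuildInfo(program, device, cl.CL_PROGRAM_BUILD_STATUS, size, &value, nil)
	fmt.Printf("build status: %v\n", value)
}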
func TestKernel(t *testing.T) {
	/* Host/device data structures */
	var platform [1]cl.CL_platform_id
	var device [1]cl.CL_device_id
	var context cl.CL_context
	var err cl.CL_int

	/* Program data structures */
	var program cl.CL_program
	var program_buffer [1][]byte
	var program_log interface{}
	var program_size [1]cl.CL_size_t
	var log_size cl.CL_size_t
	var kernels []cl.CL_kernel
	var found bool
	var i, num_kernels cl.CL_uint

	/* Access the first installed platform */
	err = cl.CLGetPlatformIDs(1, platform[:], nil)
	if err < 0 {
		t.Errorf("Couldn't find any platforms")
	}

	/* Access the first GPU/CPU */
	err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_GPU, 1, device[:], nil)
	if err == cl.CL_DEVICE_NOT_FOUND {
		err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_CPU, 1, device[:], nil)
	}
	if err < 0 {
		t.Errorf("Couldn't find any devices")
	}

	/* Create a context */
	context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err)
	if err < 0 {
		t.Errorf("Couldn't create a context")
	}

	/* Read the program file and place its content into the buffer array */
	program_handle, err1 := os.Open("test.cl")
	if err1 != nil {
		t.Errorf("Couldn't find the program file")
	}
	defer program_handle.Close()

	fi, err2 := program_handle.Stat()
	if err2 != nil {
		t.Errorf("Couldn't find the program stat")
	}
	program_size[0] = cl.CL_size_t(fi.Size())
	program_buffer[0] = make([]byte, program_size[0])
	read_size, err3 := program_handle.Read(program_buffer[0])
	if err3 != nil || cl.CL_size_t(read_size) != program_size[0] {
		t.Errorf("read file error or file size wrong")
	}

	/* Create a program containing all program content */
	program = cl.CLCreateProgramWithSource(context, 1,
		program_buffer[:], program_size[:], &err)
	if err < 0 {
		t.Errorf("Couldn't create the program")
	}

	/* Build the program */
	err = cl.CLBuildProgram(program, 1, device[:], nil, nil, nil)
	if err < 0 {
		/* Find the size of the build log and print it to standard output */
		cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG,
			0, nil, &log_size)
		cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG,
			log_size, &program_log, nil)
		t.Errorf("%s\n", program_log)
	}

	/* Find out how many kernels are in the source file */
	err = cl.CLCreateKernelsInProgram(program, 0, nil, &num_kernels)
	if err < 0 {
		t.Errorf("Couldn't find any kernels")
	} else {
		t.Logf("num_kernels = %d\n", num_kernels)
	}

	/* Create a kernel for each function */
	kernels = make([]cl.CL_kernel, num_kernels)
	err = cl.CLCreateKernelsInProgram(program, num_kernels, kernels, nil)
	if err < 0 {
		t.Errorf("Couldn't create kernels")
	}

	/* Search for the named kernel */
	for i = 0; i < num_kernels; i++ {
		var kernel_name_size cl.CL_size_t
		var kernel_name interface{}

		err = cl.CLGetKernelInfo(kernels[i], cl.CL_KERNEL_FUNCTION_NAME,
			0, nil, &kernel_name_size)
		if err < 0 {
			t.Errorf("Couldn't get the size of the kernel name, errcode=%d\n", err)
		}
		err = cl.CLGetKernelInfo(kernels[i], cl.CL_KERNEL_FUNCTION_NAME,
			kernel_name_size, &kernel_name, nil)
		if err < 0 {
			t.Errorf("Couldn't get the kernel name, errcode=%d\n", err)
		}

		if kernel_name.(string) == "mult" {
			found = true
			t.Logf("Found mult kernel at index %d.\n", i)
			break
		}
	}
	if !found {
		t.Errorf("Couldn't find the mult kernel\n")
	}

	/* Deallocate resources */
	for i = 0; i < num_kernels; i++ {
		cl.CLReleaseKernel(kernels[i])
	}
	cl.CLReleaseProgram(program)
	cl.CLReleaseContext(context)
}