func Read_image_data(filename string) (data []uint16, w, h cl.CL_size_t, err error) { reader, err1 := os.Open(filename) if err1 != nil { return nil, 0, 0, errors.New("Can't read input image file: " + filename) } defer reader.Close() m, _, err2 := image.Decode(reader) if err2 != nil { return nil, 0, 0, errors.New("Can't decode input image file") } bounds := m.Bounds() w = cl.CL_size_t(bounds.Max.X - bounds.Min.X) h = cl.CL_size_t(bounds.Max.Y - bounds.Min.Y) /* Allocate memory and read image data */ data = make([]uint16, h*w) for y := bounds.Min.Y; y < bounds.Max.Y; y++ { for x := bounds.Min.X; x < bounds.Max.X; x++ { r, _, _, _ := m.At(x, y).RGBA() data[(y-bounds.Min.Y)*int(w)+(x-bounds.Min.X)] = uint16(r) } } return data, w, h, err }
/* Create program from a file and compile it */ func Load_programsource(filename string) ([][]byte, []cl.CL_size_t) { var program_buffer [1][]byte var program_size [1]cl.CL_size_t /* Read each program file and place content into buffer array */ program_handle, err1 := os.Open(filename) if err1 != nil { fmt.Printf("Couldn't find the program file %s\n", filename) return nil, nil } defer program_handle.Close() fi, err2 := program_handle.Stat() if err2 != nil { fmt.Printf("Couldn't find the program stat\n") return nil, nil } program_size[0] = cl.CL_size_t(fi.Size()) program_buffer[0] = make([]byte, program_size[0]) read_size, err3 := program_handle.Read(program_buffer[0]) if err3 != nil || cl.CL_size_t(read_size) != program_size[0] { fmt.Printf("read file error or file size wrong\n") return nil, nil } return program_buffer[:], program_size[:] }
/* Create program from a file and compile it */ func Build_program(context cl.CL_context, device []cl.CL_device_id, filename string, options []byte) *cl.CL_program { var program cl.CL_program //var program_handle; var program_buffer [1][]byte var program_log interface{} var program_size [1]cl.CL_size_t var log_size cl.CL_size_t var err cl.CL_int /* Read each program file and place content into buffer array */ program_handle, err1 := os.Open(filename) if err1 != nil { fmt.Printf("Couldn't find the program file %s\n", filename) return nil } defer program_handle.Close() fi, err2 := program_handle.Stat() if err2 != nil { fmt.Printf("Couldn't find the program stat\n") return nil } program_size[0] = cl.CL_size_t(fi.Size()) program_buffer[0] = make([]byte, program_size[0]) read_size, err3 := program_handle.Read(program_buffer[0]) if err3 != nil || cl.CL_size_t(read_size) != program_size[0] { fmt.Printf("read file error or file size wrong\n") return nil } /* Create a program containing all program content */ program = cl.CLCreateProgramWithSource(context, 1, program_buffer[:], program_size[:], &err) if err < 0 { fmt.Printf("Couldn't create the program\n") } /* Build program */ err = cl.CLBuildProgram(program, 1, device[:], options, nil, nil) if err < 0 { /* Find size of log and print to std output */ cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG, 0, nil, &log_size) cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG, log_size, &program_log, nil) fmt.Printf("%s\n", program_log) return nil } return &program }
func cpuInitSearchKeys(commandQueue cl.CL_command_queue, svmSearchBuf unsafe.Pointer) { var nextData *searchKey var status cl.CL_int status = cl.CLEnqueueSVMMap(commandQueue, cl.CL_TRUE, //blocking call cl.CL_MAP_WRITE_INVALIDATE_REGION, svmSearchBuf, cl.CL_size_t(NUMBER_OF_SEARCH_KEY*unsafe.Sizeof(sampleKey)), 0, nil, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMMap(svmSearchBuf)") r := rand.New(rand.NewSource(999)) // initialize nodes for i := 0; i < NUMBER_OF_SEARCH_KEY; i++ { nextData = (*searchKey)(unsafe.Pointer(uintptr(svmSearchBuf) + uintptr(i)*unsafe.Sizeof(sampleKey))) // allocate a random value to node nextData.key = cl.CL_int(r.Int()) // all pointers are null nextData.oclNode = nil nextData.nativeNode = nil } status = cl.CLEnqueueSVMUnmap(commandQueue, svmSearchBuf, 0, nil, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMUnmap(svmSearchBuf)") }
func svmCompareResults(commandQueue cl.CL_command_queue, svmSearchBuf unsafe.Pointer) bool { var compare_status bool var status cl.CL_int status = cl.CLEnqueueSVMMap(commandQueue, cl.CL_TRUE, //blocking call cl.CL_MAP_WRITE_INVALIDATE_REGION, svmSearchBuf, cl.CL_size_t(NUMBER_OF_SEARCH_KEY*unsafe.Sizeof(sampleKey)), 0, nil, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMMap(svmSearchBuf)") compare_status = true for i := 0; i < NUMBER_OF_SEARCH_KEY; i++ { currKey := (*searchKey)(unsafe.Pointer(uintptr(svmSearchBuf) + uintptr(i)*unsafe.Sizeof(sampleKey))) /* compare OCL and native nodes */ if currKey.oclNode != currKey.nativeNode { compare_status = false break } } status = cl.CLEnqueueSVMUnmap(commandQueue, svmSearchBuf, 0, nil, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMUnmap(svmSearchBuf)") return compare_status }
/** * cpuCreateBinaryTree() * creates a tree from the data in "svmTreeBuf". If this is NULL returns NULL * else returns root of the tree. **/ func cpuCreateBinaryTree(commandQueue cl.CL_command_queue, svmTreeBuf unsafe.Pointer) *node { var root *node var status cl.CL_int // reserve svm space for CPU update status = cl.CLEnqueueSVMMap(commandQueue, cl.CL_TRUE, //blocking call cl.CL_MAP_WRITE_INVALIDATE_REGION, svmTreeBuf, cl.CL_size_t(NUMBER_OF_NODES*unsafe.Sizeof(sampleNode)), 0, nil, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMMap(svmTreeBuf)") //init node and make bt root = cpuMakeBinaryTree(svmTreeBuf) status = cl.CLEnqueueSVMUnmap(commandQueue, svmTreeBuf, 0, nil, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMUnmap(svmTreeBuf)") return root }
func main() { /* Host/device data structures */ var platform [1]cl.CL_platform_id var device [1]cl.CL_device_id var flag interface{} //cl.CL_device_fp_config; var err cl.CL_int /* Identify a platform */ err = cl.CLGetPlatformIDs(1, platform[:], nil) if err < 0 { println("Couldn't identify a platform") return } /* Access a device */ err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_GPU, 1, device[:], nil) if err == cl.CL_DEVICE_NOT_FOUND { err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_CPU, 1, device[:], nil) } if err < 0 { println("Couldn't access any devices") return } /* Check float-processing features */ err = cl.CLGetDeviceInfo(device[0], cl.CL_DEVICE_SINGLE_FP_CONFIG, cl.CL_size_t(unsafe.Sizeof(flag)), &flag, nil) if err < 0 { println("Couldn't read floating-point properties") return } fmt.Printf("Float Processing Features:\n") if (flag.(cl.CL_device_fp_config) & cl.CL_FP_INF_NAN) > 0 { fmt.Printf("INF and NaN values supported.\n") } if (flag.(cl.CL_device_fp_config) & cl.CL_FP_DENORM) > 0 { fmt.Printf("Denormalized numbers supported.\n") } if (flag.(cl.CL_device_fp_config) & cl.CL_FP_ROUND_TO_NEAREST) > 0 { fmt.Printf("Round To Nearest Even mode supported.\n") } if (flag.(cl.CL_device_fp_config) & cl.CL_FP_ROUND_TO_INF) > 0 { fmt.Printf("Round To Infinity mode supported.\n") } if (flag.(cl.CL_device_fp_config) & cl.CL_FP_ROUND_TO_ZERO) > 0 { fmt.Printf("Round To Zero mode supported.\n") } if (flag.(cl.CL_device_fp_config) & cl.CL_FP_FMA) > 0 { fmt.Printf("Floating-point multiply-and-add operation supported.\n") } if (flag.(cl.CL_device_fp_config) & cl.CL_FP_SOFT_FLOAT) > 0 { fmt.Printf("Basic floating-point processing performed in software.\n") } }
func main() { var i, j cl.CL_size_t // Rows and columns in the input image inputFile := "test.png" outputFile := "output.png" refFile := "ref.png" // Homegrown function to read a BMP from file inputpixels, imageWidth, imageHeight, err1 := utils.Read_image_data(inputFile) if err1 != nil { log.Fatal(err1) return } else { fmt.Printf("width=%d, height=%d (%d)\n", imageWidth, imageHeight, inputpixels[0]) } // Output image on the host outputpixels := make([]uint16, imageHeight*imageWidth) inputImage := make([]float32, imageHeight*imageWidth) outputImage := make([]float32, imageHeight*imageWidth) refImage := make([]float32, imageHeight*imageWidth) for i = 0; i < imageHeight*imageWidth; i++ { inputImage[i] = float32(inputpixels[i]) } // 45 degree motion blur var filter = [49]float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 0, 0, 0, -2, 0, 2, 0, 0, 0, 0, -1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} // The convolution filter is 7x7 filterWidth := cl.CL_size_t(7) filterSize := cl.CL_size_t(filterWidth * filterWidth) // Assume a square kernel // Set up the OpenCL environment var status cl.CL_int // Discovery platform var platform [1]cl.CL_platform_id status = cl.CLGetPlatformIDs(1, platform[:], nil) chk(status, "clGetPlatformIDs") // Discover device var device [1]cl.CL_device_id cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_ALL, 1, device[:], nil) chk(status, "clGetDeviceIDs") // Create context //var props =[3]cl.CL_context_properties{cl.CL_CONTEXT_PLATFORM, // (cl.CL_context_properties)(unsafe.Pointer(&platform[0])), 0}; var context cl.CL_context context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &status) chk(status, "clCreateContext") // Create command queue var queue cl.CL_command_queue queue = cl.CLCreateCommandQueue(context, device[0], 0, &status) chk(status, "clCreateCommandQueue") // The image format describes how the data will be stored in memory var format cl.CL_image_format format.Image_channel_order = cl.CL_R // single channel format.Image_channel_data_type = cl.CL_FLOAT // float data type var desc cl.CL_image_desc desc.Image_type = cl.CL_MEM_OBJECT_IMAGE2D desc.Image_width = imageWidth desc.Image_height = imageHeight desc.Image_depth = 0 desc.Image_array_size = 0 desc.Image_row_pitch = 0 desc.Image_slice_pitch = 0 desc.Num_mip_levels = 0 desc.Num_samples = 0 desc.Buffer = cl.CL_mem{} // Create space for the source image on the device d_inputImage := cl.CLCreateImage(context, cl.CL_MEM_READ_ONLY, &format, &desc, nil, &status) chk(status, "clCreateImage") // Create space for the output image on the device d_outputImage := cl.CLCreateImage(context, cl.CL_MEM_WRITE_ONLY, &format, &desc, nil, &status) chk(status, "clCreateImage") // Create space for the 7x7 filter on the device d_filter := cl.CLCreateBuffer(context, 0, filterSize*cl.CL_size_t(unsafe.Sizeof(filter[0])), nil, &status) chk(status, "clCreateBuffer") // Copy the source image to the device var origin = [3]cl.CL_size_t{0, 0, 0} // Offset within the image to copy from var region = [3]cl.CL_size_t{cl.CL_size_t(imageWidth), cl.CL_size_t(imageHeight), 1} // Elements to per dimension status = cl.CLEnqueueWriteImage(queue, d_inputImage, cl.CL_FALSE, origin, region, 0, 0, unsafe.Pointer(&inputImage[0]), 0, nil, nil) chk(status, "clEnqueueWriteImage") // Copy the 7x7 filter to the device status = cl.CLEnqueueWriteBuffer(queue, d_filter, cl.CL_FALSE, 0, filterSize*cl.CL_size_t(unsafe.Sizeof(filter[0])), unsafe.Pointer(&filter[0]), 0, nil, nil) chk(status, "clEnqueueWriteBuffer") // Create the image 
sampler sampler := cl.CLCreateSampler(context, cl.CL_FALSE, cl.CL_ADDRESS_CLAMP_TO_EDGE, cl.CL_FILTER_NEAREST, &status) chk(status, "clCreateSampler") // Create a program object with source and build it program := utils.Build_program(context, device[:], "convolution.cl", nil) kernel := cl.CLCreateKernel(*program, []byte("convolution"), &status) chk(status, "clCreateKernel") // Set the kernel arguments var w, h, f cl.CL_int w = cl.CL_int(imageWidth) h = cl.CL_int(imageHeight) f = cl.CL_int(filterWidth) status = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(d_inputImage)), unsafe.Pointer(&d_inputImage)) status |= cl.CLSetKernelArg(kernel, 1, cl.CL_size_t(unsafe.Sizeof(d_outputImage)), unsafe.Pointer(&d_outputImage)) status |= cl.CLSetKernelArg(kernel, 2, cl.CL_size_t(unsafe.Sizeof(h)), unsafe.Pointer(&h)) status |= cl.CLSetKernelArg(kernel, 3, cl.CL_size_t(unsafe.Sizeof(w)), unsafe.Pointer(&w)) status |= cl.CLSetKernelArg(kernel, 4, cl.CL_size_t(unsafe.Sizeof(d_filter)), unsafe.Pointer(&d_filter)) status |= cl.CLSetKernelArg(kernel, 5, cl.CL_size_t(unsafe.Sizeof(f)), unsafe.Pointer(&f)) status |= cl.CLSetKernelArg(kernel, 6, cl.CL_size_t(unsafe.Sizeof(sampler)), unsafe.Pointer(&sampler)) chk(status, "clSetKernelArg") // Set the work item dimensions var globalSize = [2]cl.CL_size_t{imageWidth, imageHeight} status = cl.CLEnqueueNDRangeKernel(queue, kernel, 2, nil, globalSize[:], nil, 0, nil, nil) chk(status, "clEnqueueNDRange") // Read the image back to the host status = cl.CLEnqueueReadImage(queue, d_outputImage, cl.CL_TRUE, origin, region, 0, 0, unsafe.Pointer(&outputImage[0]), 0, nil, nil) chk(status, "clEnqueueReadImage") // Write the output image to file for i = 0; i < imageHeight*imageWidth; i++ { outputpixels[i] = uint16(outputImage[i]) } utils.Write_image_data(outputFile, outputpixels, imageWidth, imageHeight) // Compute the reference image for i = 0; i < imageHeight; i++ { for j = 0; j < imageWidth; j++ { refImage[i*imageWidth+j] = 0 } } // Iterate over the rows of the source image halfFilterWidth := filterWidth / 2 var sum float32 for i = 0; i < imageHeight; i++ { // Iterate over the columns of the source image for j = 0; j < imageWidth; j++ { sum = 0 // Reset sum for new source pixel // Apply the filter to the neighborhood for k := -halfFilterWidth; k <= halfFilterWidth; k++ { for l := -halfFilterWidth; l <= halfFilterWidth; l++ { if i+k >= 0 && i+k < imageHeight && j+l >= 0 && j+l < imageWidth { sum += inputImage[(i+k)*imageWidth+j+l] * filter[(k+halfFilterWidth)*filterWidth+ l+halfFilterWidth] } else { i_k := i + k j_l := j + l if i+k < 0 { i_k = 0 } else if i+k >= imageHeight { i_k = imageHeight - 1 } if j+l < 0 { j_l = 0 } else if j+l >= imageWidth { j_l = imageWidth - 1 } sum += inputImage[(i_k)*imageWidth+j_l] * filter[(k+halfFilterWidth)*filterWidth+ l+halfFilterWidth] } } } refImage[i*imageWidth+j] = sum } } // Write the ref image to file for i = 0; i < imageHeight*imageWidth; i++ { outputpixels[i] = uint16(refImage[i]) } utils.Write_image_data(refFile, outputpixels, imageWidth, imageHeight) failed := 0 for i = 0; i < imageHeight; i++ { for j = 0; j < imageWidth; j++ { if math.Abs(float64(outputImage[i*imageWidth+j]-refImage[i*imageWidth+j])) > 0.01 { //fmt.Printf("Results are INCORRECT\n"); //fmt.Printf("Pixel mismatch at <%d,%d> (%f vs. 
%f) %f\n", i, j, // outputImage[i*imageWidth+j], refImage[i*imageWidth+j], inputImage[i*imageWidth+j]); failed++ } } } fmt.Printf("Mismatch Pixel number/Total pixel number = %d/%d\n", failed, imageWidth*imageHeight) // Free OpenCL resources cl.CLReleaseKernel(kernel) cl.CLReleaseProgram(*program) cl.CLReleaseCommandQueue(queue) cl.CLReleaseMemObject(d_inputImage) cl.CLReleaseMemObject(d_outputImage) cl.CLReleaseMemObject(d_filter) cl.CLReleaseSampler(sampler) cl.CLReleaseContext(context) }
func TestQueue(t *testing.T) { /* Host/device data structures */ var platforms []ocl.Platform var devices []ocl.Device var context ocl.Context var queue ocl.CommandQueue var err error var ref_count interface{} /* Identify a platform */ if platforms, err = ocl.GetPlatforms(); err != nil { t.Errorf(err.Error()) return } /* Determine connected devices */ if devices, err = platforms[0].GetDevices(cl.CL_DEVICE_TYPE_GPU); err != nil { if devices, err = platforms[0].GetDevices(cl.CL_DEVICE_TYPE_CPU); err != nil { t.Errorf(err.Error()) return } } devices = devices[0:1] /* Create the context */ if context, err = devices[0].CreateContext(nil, nil, nil); err != nil { t.Errorf(err.Error()) return } defer context.Release() /* Create the command queue */ if queue, err = context.CreateCommandQueue(devices[0], nil); err != nil { t.Errorf(err.Error()) return } defer queue.Release() /* Get the reference count */ if ref_count, err = queue.GetInfo(cl.CL_QUEUE_REFERENCE_COUNT); err != nil { t.Errorf(err.Error()) return } t.Logf("Initial reference count: %d\n", ref_count.(cl.CL_uint)) /* Update and display the reference count */ queue.Retain() if ref_count, err = queue.GetInfo(cl.CL_QUEUE_REFERENCE_COUNT); err != nil { t.Errorf(err.Error()) return } t.Logf("Reference count: %d\n", ref_count.(cl.CL_uint)) queue.Release() if ref_count, err = queue.GetInfo(cl.CL_QUEUE_REFERENCE_COUNT); err != nil { t.Errorf(err.Error()) return } t.Logf("Reference count: %d\n", ref_count.(cl.CL_uint)) /* Program/kernel data structures */ var program ocl.Program var program_size [1]cl.CL_size_t var program_buffer [1][]byte var program_log interface{} /* Read each program file and place content into buffer array */ program_handle, err1 := os.Open("blank.cl") if err1 != nil { t.Errorf(err1.Error()) return } defer program_handle.Close() fi, err2 := program_handle.Stat() if err2 != nil { t.Errorf(err2.Error()) return } program_size[0] = cl.CL_size_t(fi.Size()) program_buffer[0] = make([]byte, program_size[0]) read_size, err3 := program_handle.Read(program_buffer[0]) if err3 != nil || cl.CL_size_t(read_size) != program_size[0] { t.Errorf("read file error or file size wrong") return } // Create program from file if program, err = context.CreateProgramWithSource(1, program_buffer[:], program_size[:]); err != nil { t.Errorf(err.Error()) return } defer program.Release() /* Build program */ if err = program.Build(devices, nil, nil, nil); err != nil { t.Errorf(err.Error()) /* Find size of log and print to std output */ if program_log, err = program.GetBuildInfo(devices[0], cl.CL_PROGRAM_BUILD_LOG); err != nil { t.Errorf(err.Error()) } else { t.Errorf("%s\n", program_log.(string)) } return } //var kernel cl.CL_kernel // /* Create the kernel */ // kernel = cl.CLCreateKernel(program, []byte("blank"), &err) // if err < 0 { // t.Errorf("Couldn't create the kernel") // } // /* Enqueue the kernel execution command */ // err = cl.CLEnqueueTask(queue, kernel, 0, nil, nil) // if err < 0 { // t.Errorf("Couldn't enqueue the kernel execution command") // } else { // t.Logf("Successfully queued kernel.\n") // } //cl.CLReleaseKernel(kernel) }
func main() { // This code executes on the OpenCL host // Host data var size cl.CL_int var A []cl.CL_int //input array var B []cl.CL_int //input array var C []cl.CL_int //output array // Elements in each array const elements = cl.CL_size_t(2048) // Compute the size of the data datasize := cl.CL_size_t(unsafe.Sizeof(size)) * elements // Allocate space for input/output data A = make([]cl.CL_int, datasize) B = make([]cl.CL_int, datasize) C = make([]cl.CL_int, datasize) // Initialize the input data for i := cl.CL_int(0); i < cl.CL_int(elements); i++ { A[i] = i B[i] = i } // Use this to check the output of each API call var status cl.CL_int //----------------------------------------------------- // STEP 1: Discover and initialize the platforms //----------------------------------------------------- var numPlatforms cl.CL_uint var platforms []cl.CL_platform_id // Use clGetPlatformIDs() to retrieve the number of // platforms status = cl.CLGetPlatformIDs(0, nil, &numPlatforms) // Allocate enough space for each platform platforms = make([]cl.CL_platform_id, numPlatforms) // Fill in platforms with clGetPlatformIDs() status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil) if status != cl.CL_SUCCESS { println("CLGetPlatformIDs status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 2: Discover and initialize the devices //----------------------------------------------------- var numDevices cl.CL_uint var devices []cl.CL_device_id // Use clGetDeviceIDs() to retrieve the number of // devices present status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_ALL, 0, nil, &numDevices) if status != cl.CL_SUCCESS { println("CLGetDeviceIDs status!=cl.CL_SUCCESS") return } // Allocate enough space for each device devices = make([]cl.CL_device_id, numDevices) // Fill in devices with clGetDeviceIDs() status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_ALL, numDevices, devices, nil) if status != cl.CL_SUCCESS { println("CLGetDeviceIDs status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 3: Create a context //----------------------------------------------------- var context cl.CL_context // Create a context using clCreateContext() and // associate it with the devices context = cl.CLCreateContext(nil, numDevices, devices, nil, nil, &status) if status != cl.CL_SUCCESS { println("CLCreateContext status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 4: Create a command queue //----------------------------------------------------- var cmdQueue cl.CL_command_queue // Create a command queue using clCreateCommandQueue(), // and associate it with the device you want to execute // on cmdQueue = cl.CLCreateCommandQueue(context, devices[0], 0, &status) if status != cl.CL_SUCCESS { println("CLCreateCommandQueue status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 5: Create device buffers //----------------------------------------------------- var bufferA cl.CL_mem // Input array on the device var bufferB cl.CL_mem // Input array on the device var bufferC cl.CL_mem // Output array on the device // Use clCreateBuffer() to create a buffer object (d_A) // that will contain the data from the host array A bufferA = cl.CLCreateBuffer(context, cl.CL_MEM_READ_ONLY, datasize, nil, &status) if status != cl.CL_SUCCESS { println("CLCreateBuffer status!=cl.CL_SUCCESS") return } // Use clCreateBuffer() to create a buffer object (d_B) // that will contain the 
data from the host array B bufferB = cl.CLCreateBuffer(context, cl.CL_MEM_READ_ONLY, datasize, nil, &status) if status != cl.CL_SUCCESS { println("CLCreateBuffer status!=cl.CL_SUCCESS") return } // Use clCreateBuffer() to create a buffer object (d_C) // with enough space to hold the output data bufferC = cl.CLCreateBuffer(context, cl.CL_MEM_WRITE_ONLY, datasize, nil, &status) if status != cl.CL_SUCCESS { println("CLCreateBuffer status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 6: Write host data to device buffers //----------------------------------------------------- // Use clEnqueueWriteBuffer() to write input array A to // the device buffer bufferA status = cl.CLEnqueueWriteBuffer(cmdQueue, bufferA, cl.CL_FALSE, 0, datasize, unsafe.Pointer(&A[0]), 0, nil, nil) if status != cl.CL_SUCCESS { println("CLEnqueueWriteBuffer status!=cl.CL_SUCCESS") return } // Use clEnqueueWriteBuffer() to write input array B to // the device buffer bufferB status = cl.CLEnqueueWriteBuffer(cmdQueue, bufferB, cl.CL_FALSE, 0, datasize, unsafe.Pointer(&B[0]), 0, nil, nil) if status != cl.CL_SUCCESS { println("CLEnqueueWriteBuffer status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 7: Create and compile the program //----------------------------------------------------- programSource, programeSize := utils.Load_programsource("chapter2.cl") // Create a program using clCreateProgramWithSource() program := cl.CLCreateProgramWithSource(context, 1, programSource[:], programeSize[:], &status) if status != cl.CL_SUCCESS { println("CLCreateProgramWithSource status!=cl.CL_SUCCESS") return } // Build (compile) the program for the devices with // clBuildProgram() status = cl.CLBuildProgram(program, numDevices, devices, nil, nil, nil) if status != cl.CL_SUCCESS { println("CLBuildProgram status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 8: Create the kernel //----------------------------------------------------- var kernel cl.CL_kernel // Use clCreateKernel() to create a kernel from the // vector addition function (named "vecadd") kernel = cl.CLCreateKernel(program, []byte("vecadd"), &status) if status != cl.CL_SUCCESS { println("CLCreateKernel status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 9: Set the kernel arguments //----------------------------------------------------- // Associate the input and output buffers with the // kernel // using clSetKernelArg() status = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(bufferA)), unsafe.Pointer(&bufferA)) status |= cl.CLSetKernelArg(kernel, 1, cl.CL_size_t(unsafe.Sizeof(bufferB)), unsafe.Pointer(&bufferB)) status |= cl.CLSetKernelArg(kernel, 2, cl.CL_size_t(unsafe.Sizeof(bufferC)), unsafe.Pointer(&bufferC)) if status != cl.CL_SUCCESS { println("CLSetKernelArg status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 10: Configure the work-item structure //----------------------------------------------------- // Define an index space (global work size) of work // items for // execution. A workgroup size (local work size) is not // required, // but can be used. 
var globalWorkSize [1]cl.CL_size_t // There are 'elements' work-items globalWorkSize[0] = elements //----------------------------------------------------- // STEP 11: Enqueue the kernel for execution //----------------------------------------------------- // Execute the kernel by using // clEnqueueNDRangeKernel(). // 'globalWorkSize' is the 1D dimension of the // work-items status = cl.CLEnqueueNDRangeKernel(cmdQueue, kernel, 1, nil, globalWorkSize[:], nil, 0, nil, nil) if status != cl.CL_SUCCESS { println("CLEnqueueNDRangeKernel status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 12: Read the output buffer back to the host //----------------------------------------------------- // Use clEnqueueReadBuffer() to read the OpenCL output // buffer (bufferC) // to the host output array (C) cl.CLEnqueueReadBuffer(cmdQueue, bufferC, cl.CL_TRUE, 0, datasize, unsafe.Pointer(&C[0]), 0, nil, nil) if status != cl.CL_SUCCESS { println("CLEnqueueReadBuffer status!=cl.CL_SUCCESS") return } // Verify the output result := true for i := cl.CL_int(0); i < cl.CL_int(elements); i++ { if C[i] != i+i { result = false break } } if result { println("Output is correct\n") } else { println("Output is incorrect\n") } //----------------------------------------------------- // STEP 13: Release OpenCL resources //----------------------------------------------------- // Free OpenCL resources cl.CLReleaseKernel(kernel) cl.CLReleaseProgram(program) cl.CLReleaseCommandQueue(cmdQueue) cl.CLReleaseMemObject(bufferA) cl.CLReleaseMemObject(bufferB) cl.CLReleaseMemObject(bufferC) cl.CLReleaseContext(context) }
func main() { /* OpenCL data structures */ var device []cl.CL_device_id var context cl.CL_context var queue cl.CL_command_queue var program *cl.CL_program var kernel cl.CL_kernel var err cl.CL_int /* Data and buffers */ var select1 [4]float32 var select2 [2]cl.CL_uchar var select1_buffer, select2_buffer cl.CL_mem /* Create a context */ device = utils.Create_device() context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err) if err < 0 { println("Couldn't create a context") return } /* Create a kernel */ program = utils.Build_program(context, device[:], PROGRAM_FILE, nil) kernel = cl.CLCreateKernel(*program, KERNEL_FUNC, &err) if err < 0 { println("Couldn't create a kernel") return } /* Create a write-only buffer to hold the output data */ select1_buffer = cl.CLCreateBuffer(context, cl.CL_MEM_WRITE_ONLY, cl.CL_size_t(unsafe.Sizeof(select1)), nil, &err) if err < 0 { println("Couldn't create a buffer") return } select2_buffer = cl.CLCreateBuffer(context, cl.CL_MEM_WRITE_ONLY, cl.CL_size_t(unsafe.Sizeof(select2)), nil, &err) /* Create kernel argument */ err = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(select1_buffer)), unsafe.Pointer(&select1_buffer)) if err < 0 { println("Couldn't set a kernel argument") return } cl.CLSetKernelArg(kernel, 1, cl.CL_size_t(unsafe.Sizeof(select2_buffer)), unsafe.Pointer(&select2_buffer)) /* Create a command queue */ queue = cl.CLCreateCommandQueue(context, device[0], 0, &err) if err < 0 { println("Couldn't create a command queue") return } /* Enqueue kernel */ err = cl.CLEnqueueTask(queue, kernel, 0, nil, nil) if err < 0 { println("Couldn't enqueue the kernel") return } /* Read and print the result */ err = cl.CLEnqueueReadBuffer(queue, select1_buffer, cl.CL_TRUE, 0, cl.CL_size_t(unsafe.Sizeof(select1)), unsafe.Pointer(&select1), 0, nil, nil) if err < 0 { println("Couldn't read the buffer") return } cl.CLEnqueueReadBuffer(queue, select2_buffer, cl.CL_TRUE, 0, cl.CL_size_t(unsafe.Sizeof(select2)), unsafe.Pointer(&select2), 0, nil, nil) fmt.Printf("select: ") for i := 0; i < 3; i++ { fmt.Printf("%.2f, ", select1[i]) } fmt.Printf("%.2f\n", select1[3]) fmt.Printf("bitselect: %X, %X\n", select2[0], select2[1]) /* Deallocate resources */ cl.CLReleaseMemObject(select1_buffer) cl.CLReleaseMemObject(select2_buffer) cl.CLReleaseKernel(kernel) cl.CLReleaseCommandQueue(queue) cl.CLReleaseProgram(*program) cl.CLReleaseContext(context) }
func main() { /* OpenCL data structures */ var device []cl.CL_device_id var context cl.CL_context var queue cl.CL_command_queue var program *cl.CL_program var kernel cl.CL_kernel var err cl.CL_int /* Data and buffers */ var a float32 = 6.0 var b float32 = 2.0 var result float32 var a_buffer, b_buffer, output_buffer cl.CL_mem /* Extension data */ var sizeofuint cl.CL_uint var addr_data interface{} var ext_data interface{} fp64_ext := "cl_khr_fp64" var ext_size cl.CL_size_t var options []byte /* Create a device and context */ device = utils.Create_device() context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err) if err < 0 { println("Couldn't create a context") return } /* Obtain the device data */ if cl.CLGetDeviceInfo(device[0], cl.CL_DEVICE_ADDRESS_BITS, cl.CL_size_t(unsafe.Sizeof(sizeofuint)), &addr_data, nil) < 0 { println("Couldn't read extension data") return } fmt.Printf("Address width: %v\n", addr_data.(cl.CL_uint)) /* Define "FP_64" option if doubles are supported */ cl.CLGetDeviceInfo(device[0], cl.CL_DEVICE_EXTENSIONS, 0, nil, &ext_size) // ext_data = (char*)malloc(ext_size + 1); // ext_data[ext_size] = '\0'; cl.CLGetDeviceInfo(device[0], cl.CL_DEVICE_EXTENSIONS, ext_size, &ext_data, nil) if strings.Contains(ext_data.(string), fp64_ext) { fmt.Printf("The %s extension is supported.\n", fp64_ext) options = []byte("-DFP_64 ") } else { fmt.Printf("The %s extension is not supported. %s\n", fp64_ext, ext_data.(string)) } /* Build the program and create the kernel */ program = utils.Build_program(context, device[:], PROGRAM_FILE, options) kernel = cl.CLCreateKernel(*program, KERNEL_FUNC, &err) if err < 0 { println("Couldn't create a kernel") return } /* Create CL buffers to hold input and output data */ a_buffer = cl.CLCreateBuffer(context, cl.CL_MEM_READ_ONLY| cl.CL_MEM_COPY_HOST_PTR, cl.CL_size_t(unsafe.Sizeof(a)), unsafe.Pointer(&a), &err) if err < 0 { println("Couldn't create a memory object") return } b_buffer = cl.CLCreateBuffer(context, cl.CL_MEM_READ_ONLY| cl.CL_MEM_COPY_HOST_PTR, cl.CL_size_t(unsafe.Sizeof(b)), unsafe.Pointer(&b), nil) output_buffer = cl.CLCreateBuffer(context, cl.CL_MEM_WRITE_ONLY, cl.CL_size_t(unsafe.Sizeof(b)), nil, nil) /* Create kernel arguments */ err = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(a_buffer)), unsafe.Pointer(&a_buffer)) if err < 0 { println("Couldn't set a kernel argument") return } cl.CLSetKernelArg(kernel, 1, cl.CL_size_t(unsafe.Sizeof(b_buffer)), unsafe.Pointer(&b_buffer)) cl.CLSetKernelArg(kernel, 2, cl.CL_size_t(unsafe.Sizeof(output_buffer)), unsafe.Pointer(&output_buffer)) /* Create a command queue */ queue = cl.CLCreateCommandQueue(context, device[0], 0, &err) if err < 0 { println("Couldn't create a command queue") return } /* Enqueue kernel */ err = cl.CLEnqueueTask(queue, kernel, 0, nil, nil) if err < 0 { println("Couldn't enqueue the kernel") return } /* Read and print the result */ err = cl.CLEnqueueReadBuffer(queue, output_buffer, cl.CL_TRUE, 0, cl.CL_size_t(unsafe.Sizeof(result)), unsafe.Pointer(&result), 0, nil, nil) if err < 0 { println("Couldn't read the output buffer") return } fmt.Printf("The kernel result is %f\n", result) /* Deallocate resources */ cl.CLReleaseMemObject(a_buffer) cl.CLReleaseMemObject(b_buffer) cl.CLReleaseMemObject(output_buffer) cl.CLReleaseKernel(kernel) cl.CLReleaseCommandQueue(queue) cl.CLReleaseProgram(*program) cl.CLReleaseContext(context) }
func main() { // Use this to check the output of each API call var status cl.CL_int //----------------------------------------------------- // STEP 1: Discover and initialize the platforms //----------------------------------------------------- var numPlatforms cl.CL_uint var platforms []cl.CL_platform_id // Use clGetPlatformIDs() to retrieve the number of // platforms status = cl.CLGetPlatformIDs(0, nil, &numPlatforms) // Allocate enough space for each platform platforms = make([]cl.CL_platform_id, numPlatforms) // Fill in platforms with clGetPlatformIDs() status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetPlatformIDs") //----------------------------------------------------- // STEP 2: Discover and initialize the GPU devices //----------------------------------------------------- var numDevices cl.CL_uint var devices []cl.CL_device_id // Use clGetDeviceIDs() to retrieve the number of // devices present status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU, 0, nil, &numDevices) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs") // Allocate enough space for each device devices = make([]cl.CL_device_id, numDevices) // Fill in devices with clGetDeviceIDs() status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU, numDevices, devices, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs") //----------------------------------------------------- // STEP 3: Create a context //----------------------------------------------------- var context cl.CL_context // Create a context using clCreateContext() and // associate it with the devices context = cl.CLCreateContext(nil, numDevices, devices, nil, nil, &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateContext") defer cl.CLReleaseContext(context) //----------------------------------------------------- // STEP 4: Create a command queue //----------------------------------------------------- var cmdQueue cl.CL_command_queue // Create a command queue using clCreateCommandQueueWithProperties(), // and associate it with the device you want to execute cmdQueue = cl.CLCreateCommandQueueWithProperties(context, devices[0], nil, &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateCommandQueueWithProperties") defer cl.CLReleaseCommandQueue(cmdQueue) //----------------------------------------------------- // STEP 5: Create device buffers //----------------------------------------------------- // initialize any device/SVM memory here. 
/* svm buffer for binary tree */ svmTreeBuf := cl.CLSVMAlloc(context, cl.CL_MEM_READ_WRITE, cl.CL_size_t(NUMBER_OF_NODES*unsafe.Sizeof(sampleNode)), 0) if nil == svmTreeBuf { println("clSVMAlloc(svmTreeBuf) failed.") return } defer cl.CLSVMFree(context, svmTreeBuf) /* svm buffer for search keys */ svmSearchBuf := cl.CLSVMAlloc(context, cl.CL_MEM_READ_WRITE, cl.CL_size_t(NUMBER_OF_SEARCH_KEY*unsafe.Sizeof(sampleKey)), 0) if nil == svmSearchBuf { println("clSVMAlloc(svmSearchBuf) failed.") return } defer cl.CLSVMFree(context, svmSearchBuf) //create the binary tree and set the root /* root node of the binary tree */ svmRoot := cpuCreateBinaryTree(cmdQueue, svmTreeBuf) //initialize search keys cpuInitSearchKeys(cmdQueue, svmSearchBuf) /* if voice is not deliberately muzzled, shout parameters */ fmt.Printf("-------------------------------------------------------------------------\n") fmt.Printf("Searching %d keys in a BST having %d Nodes...\n", NUMBER_OF_SEARCH_KEY, NUMBER_OF_NODES) fmt.Printf("-------------------------------------------------------------------------\n") //----------------------------------------------------- // STEP 6: Create and compile the program //----------------------------------------------------- programSource, programeSize := utils.Load_programsource("bst.cl") // Create a program using clCreateProgramWithSource() program := cl.CLCreateProgramWithSource(context, 1, programSource[:], programeSize[:], &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateProgramWithSource") defer cl.CLReleaseProgram(program) // Build (compile) the program for the devices with // clBuildProgram() options := "-cl-std=CL2.0" status = cl.CLBuildProgram(program, numDevices, devices, []byte(options), nil, nil) if status != cl.CL_SUCCESS { var program_log interface{} var log_size cl.CL_size_t /* Find size of log and print to std output */ cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG, 0, nil, &log_size) cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG, log_size, &program_log, nil) fmt.Printf("%s\n", program_log) return } //utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLBuildProgram") //----------------------------------------------------- // STEP 7: Create the kernel //----------------------------------------------------- var kernel cl.CL_kernel // Use clCreateKernel() to create a kernel kernel = cl.CLCreateKernel(program, []byte("bst_kernel"), &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel") defer cl.CLReleaseKernel(kernel) //----------------------------------------------------- // STEP 8: Set the kernel arguments //----------------------------------------------------- // Associate the input and output buffers with the // kernel // using clSetKernelArg() // Set appropriate arguments to the kernel status = cl.CLSetKernelArgSVMPointer(kernel, 0, svmTreeBuf) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArgSVMPointer(svmTreeBuf)") status = cl.CLSetKernelArgSVMPointer(kernel, 1, svmSearchBuf) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArgSVMPointer(svmSearchBuf)") //----------------------------------------------------- // STEP 9: Configure the work-item structure //----------------------------------------------------- // Define an index space (global work size) of work // items for // execution. A workgroup size (local work size) is not // required, // but can be used. 
var localWorkSize [1]cl.CL_size_t var kernelWorkGroupSize interface{} status = cl.CLGetKernelWorkGroupInfo(kernel, devices[0], cl.CL_KERNEL_WORK_GROUP_SIZE, cl.CL_size_t(unsafe.Sizeof(localWorkSize[0])), &kernelWorkGroupSize, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetKernelWorkGroupInfo") localWorkSize[0] = kernelWorkGroupSize.(cl.CL_size_t) var globalWorkSize [1]cl.CL_size_t globalWorkSize[0] = NUMBER_OF_SEARCH_KEY //----------------------------------------------------- // STEP 10: Enqueue the kernel for execution //----------------------------------------------------- // Execute the kernel by using // clEnqueueNDRangeKernel(). // 'globalWorkSize' is the 1D dimension of the // work-items status = cl.CLEnqueueNDRangeKernel(cmdQueue, kernel, 1, nil, globalWorkSize[:], localWorkSize[:], 0, nil, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLEnqueueNDRangeKernel") status = cl.CLFlush(cmdQueue) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFlush") status = cl.CLFinish(cmdQueue) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFinish") //----------------------------------------------------- // STEP 11: Read the output buffer back to the host //----------------------------------------------------- // Use clEnqueueReadBuffer() to read the OpenCL output // buffer (bufferC) // to the host output array (C) //copy the data back to host buffer //this demo doesn't need clEnqueueReadBuffer due to SVM //----------------------------------------------------- // STEP 12: Verify the results //----------------------------------------------------- // reference implementation svmBinaryTreeCPUReference(cmdQueue, svmRoot, svmTreeBuf, svmSearchBuf) // compare the results and see if they match pass := svmCompareResults(cmdQueue, svmSearchBuf) if pass { println("Passed!") } else { println("Failed!") } }
func svmbasic(size cl.CL_size_t, context cl.CL_context, queue cl.CL_command_queue, kernel cl.CL_kernel) { // Prepare input data as follows. // Build two arrays: // - an array that consists of the Element structures // (refer to svmbasic.h for the structure definition) // - an array that consists of the float values // // Each structure of the first array has the following pointers: // - 'internal', which points to a 'value' field of another entry // of the same array. // - 'external', which points to a float value from the the // second array. // // Pointers are set randomly. The structures do not reflect any real usage // scenario, but are illustrative for a simple device-side traversal. // // Array of Element Array of floats // structures // // ||====================|| // || ............. || ||============|| // || ............. ||<-----+ || .......... || // ||====================|| | || float || // || float* internal--||------+ || float || // || float* external--||------------------>|| float || // || float value <----||------+ || .......... || // ||====================|| | || .......... || // || ............. || | || float || // || ............. || | || float || // ||====================|| | || float || // ||====================|| | || float || // || float* internal--||------+ || float || // || float* external--||------------------>|| float || // || float value || || float || // ||====================|| || float || // || ............. || || .......... || // || ............. || ||============|| // ||====================|| // // The two arrays are created independently and are used to illustrate // two new OpenCL 2.0 API functions: // - the array of Element structures is passed to the kernel as a // kernel argument with the clSetKernelArgSVMPointer function // - the array of floats is used by the kernel indirectly, and this // dependency should be also specified with the clSetKernelExecInfo // function prior to the kernel execution var err cl.CL_int // To enable host & device code to share pointer to the same address space // the arrays should be allocated as SVM memory. Use the clSVMAlloc function // to allocate SVM memory. // // Optionally, this function allows specifying alignment in bytes as its // last argument. As this basic example doesn't require any _special_ alignment, // the following code illustrates requesting default alignment via passing // zero value. inputElements := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_READ_ONLY|cl.CL_MEM_SVM_FINE_GRAIN_BUFFER, size*cl.CL_size_t(unsafe.Sizeof(sampleElement)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) if nil == inputElements { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } defer cl.CLSVMFree(context, inputElements) inputFloats := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_READ_ONLY|cl.CL_MEM_SVM_FINE_GRAIN_BUFFER, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) if nil == inputFloats { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } defer cl.CLSVMFree(context, inputFloats) // The OpenCL kernel uses the aforementioned input arrays to compute // values for the output array. 
output := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_WRITE_ONLY|cl.CL_MEM_SVM_FINE_GRAIN_BUFFER, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) defer cl.CLSVMFree(context, output) if nil == output { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } // Note: in the coarse-grained SVM, mapping of inputElement and inputFloats is // needed to do the following initialization. While here, in the fine-grained SVM, // it is not necessary. // Populate data-structures with initial data. r := rand.New(rand.NewSource(99)) for i := cl.CL_size_t(0); i < size; i++ { inputElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(i)*unsafe.Sizeof(sampleElement))) inputFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(inputFloats) + uintptr(i)*unsafe.Sizeof(sampleFloat))) randElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(r.Intn(int(size)))*unsafe.Sizeof(sampleElement))) randFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(inputFloats) + uintptr(r.Intn(int(size)))*unsafe.Sizeof(sampleFloat))) inputElement.internal = &(randElement.value) inputElement.external = randFloat inputElement.value = cl.CL_float(i) *inputFloat = cl.CL_float(i + size) } // Note: in the coarse-grained SVM, unmapping of inputElement and inputFloats is // needed before scheduling the kernel for execution. While here, in the fine-grained SVM, // it is not necessary. // Pass arguments to the kernel. // According to the OpenCL 2.0 specification, you need to use a special // function to pass a pointer from SVM memory to kernel. err = cl.CLSetKernelArgSVMPointer(kernel, 0, inputElements) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelArgSVMPointer") err = cl.CLSetKernelArgSVMPointer(kernel, 1, output) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelArgSVMPointer") // For buffer based SVM (both coarse- and fine-grain) if one SVM buffer // points to memory allocated in another SVM buffer, such allocations // should be passed to the kernel via clSetKernelExecInfo. err = cl.CLSetKernelExecInfo(kernel, cl.CL_KERNEL_EXEC_INFO_SVM_PTRS, cl.CL_size_t(unsafe.Sizeof(inputFloats)), unsafe.Pointer(&inputFloats)) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelExecInfo") // Run the kernel. println("Running kernel...") var globalWorkSize [1]cl.CL_size_t globalWorkSize[0] = size err = cl.CLEnqueueNDRangeKernel(queue, kernel, 1, nil, globalWorkSize[:], nil, 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueNDRangeKernel") // Note: In the fine-grained SVM, after enqueuing the kernel above, the host application is // not blocked from accessing SVM allocations that were passed to the kernel. The host // can access the same regions of SVM memory as does the kernel if the kernel and the host // read/modify different bytes. If one side (host or device) needs to modify the same bytes // that are simultaniously read/modified by another side, atomics operations are usually // required to maintain sufficient memory consistency. This sample doesn't use this possibility // and the host just waits in clFinish below until the kernel is finished. err = cl.CLFinish(queue) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLFinish") println(" DONE.") // Validate output state for correctness. // Compare: in the coarse-grained SVM case you need to map the output. // Here it is not needed. 
println("Checking correctness of the output buffer...") for i := cl.CL_size_t(0); i < size; i++ { inputElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(i)*unsafe.Sizeof(sampleElement))) outputFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(output) + uintptr(i)*unsafe.Sizeof(sampleFloat))) expectedValue := *(inputElement.internal) + *(inputElement.external) if *outputFloat != expectedValue { println(" FAILED.") fmt.Printf("Mismatch at position %d, read %f, expected %f\n", i, *outputFloat, expectedValue) return } } println(" PASSED.") }
func main() { /* OpenCL data structures */ var device []cl.CL_device_id var context cl.CL_context var queue cl.CL_command_queue var program *cl.CL_program var kernel cl.CL_kernel var err cl.CL_int /* Data and events */ var data []float32 var data_buffer cl.CL_mem var user_event, kernel_event, read_event [1]cl.CL_event /* Initialize data */ data = make([]float32, 4) for i := 0; i < 4; i++ { data[i] = float32(i) * 1.0 } /* Create a device and context */ device = utils.Create_device() context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err) if err < 0 { println("Couldn't create a context") return } /* Build the program and create a kernel */ program = utils.Build_program(context, device[:], PROGRAM_FILE, nil) kernel = cl.CLCreateKernel(*program, KERNEL_FUNC, &err) if err < 0 { println("Couldn't create a kernel") return } /* Create a buffer to hold data */ data_buffer = cl.CLCreateBuffer(context, cl.CL_MEM_READ_WRITE|cl.CL_MEM_COPY_HOST_PTR, cl.CL_size_t(unsafe.Sizeof(data[0]))*4, unsafe.Pointer(&data[0]), &err) if err < 0 { println("Couldn't create a buffer") return } /* Create kernel argument */ err = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(data_buffer)), unsafe.Pointer(&data_buffer)) if err < 0 { println("Couldn't set a kernel argument") return } /* Create a command queue */ queue = cl.CLCreateCommandQueue(context, device[0], cl.CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err) if err < 0 { println("Couldn't create a command queue") return } /* Configure events */ user_event[0] = cl.CLCreateUserEvent(context, &err) if err < 0 { println("Couldn't enqueue the kernel") return } /* Enqueue kernel */ err = cl.CLEnqueueTask(queue, kernel, 1, user_event[:], &kernel_event[0]) if err < 0 { println("Couldn't enqueue the kernel") return } /* Read the buffer */ err = cl.CLEnqueueReadBuffer(queue, data_buffer, cl.CL_FALSE, 0, cl.CL_size_t(unsafe.Sizeof(data[0]))*4, unsafe.Pointer(&data[0]), 1, kernel_event[:], &read_event[0]) if err < 0 { println("Couldn't read the buffer") return } /* Set callback for event */ err = cl.CLSetEventCallback(read_event[0], cl.CL_COMPLETE, read_complete, unsafe.Pointer(&data)) if err < 0 { println("Couldn't set callback for event") return } /* Sleep for a second to demonstrate the that commands haven't started executing. Then prompt user */ time.Sleep(1) fmt.Printf("Old data: %4.2f, %4.2f, %4.2f, %4.2f\n", data[0], data[1], data[2], data[3]) fmt.Printf("Press ENTER to continue.\n") //getchar(); reader := bufio.NewReader(os.Stdin) reader.ReadString('\n') /* Set user event to success */ cl.CLSetUserEventStatus(user_event[0], cl.CL_SUCCESS) /* Deallocate resources */ cl.CLReleaseEvent(read_event[0]) cl.CLReleaseEvent(kernel_event[0]) cl.CLReleaseEvent(user_event[0]) cl.CLReleaseMemObject(data_buffer) cl.CLReleaseKernel(kernel) cl.CLReleaseCommandQueue(queue) cl.CLReleaseProgram(*program) cl.CLReleaseContext(context) }
func TestProgram(t *testing.T) { /* Host/device data structures */ var platform [1]cl.CL_platform_id var device [1]cl.CL_device_id var context cl.CL_context var i, err cl.CL_int /* Program data structures */ var program cl.CL_program var program_buffer [NUM_FILES][]byte var program_log interface{} var file_name = []string{"bad.cl", "good.cl"} options := "-cl-finite-math-only -cl-no-signed-zeros" var program_size [NUM_FILES]cl.CL_size_t var log_size cl.CL_size_t /* Access the first installed platform */ err = cl.CLGetPlatformIDs(1, platform[:], nil) if err < 0 { t.Errorf("Couldn't find any platforms") } /* Access the first GPU/CPU */ err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_GPU, 1, device[:], nil) if err == cl.CL_DEVICE_NOT_FOUND { err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_CPU, 1, device[:], nil) } if err < 0 { t.Errorf("Couldn't find any devices") } /* Create a context */ context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err) if err < 0 { t.Errorf("Couldn't create a context") } /* Read each program file and place content into buffer array */ for i = 0; i < NUM_FILES; i++ { program_handle, err := os.Open(file_name[i]) if err != nil { t.Errorf("Couldn't find the program file") } defer program_handle.Close() fi, err2 := program_handle.Stat() if err2 != nil { t.Errorf("Couldn't find the program stat") } program_size[i] = cl.CL_size_t(fi.Size()) program_buffer[i] = make([]byte, program_size[i]) read_size, err3 := program_handle.Read(program_buffer[i]) if err3 != nil || cl.CL_size_t(read_size) != program_size[i] { t.Errorf("read file error or file size wrong") } } /* Create a program containing all program content */ program = cl.CLCreateProgramWithSource(context, NUM_FILES, program_buffer[:], program_size[:], &err) if err < 0 { t.Errorf("Couldn't create the program") } /* Build program */ err = cl.CLBuildProgram(program, 1, device[:], []byte(options), nil, nil) if err < 0 { /* Find size of log and print to std output */ cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG, 0, nil, &log_size) //program_log = (char*) malloc(log_size+1); //program_log[log_size] = '\0'; cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG, log_size, &program_log, nil) t.Errorf("%s\n", program_log) //free(program_log); } /* Deallocate resources */ //for(i=0; i<NUM_FILES; i++) { // free(program_buffer[i]); //} cl.CLReleaseProgram(program) cl.CLReleaseContext(context) }
func main() { /* Host/device data structures */ var device []cl.CL_device_id var context cl.CL_context var err cl.CL_int /* Data and buffers */ var main_data [100]float32 var main_buffer, sub_buffer cl.CL_mem var main_buffer_mem, sub_buffer_mem interface{} var main_buffer_size, sub_buffer_size interface{} var buffer_size cl.CL_size_t var buffer_mem cl.CL_ulong var region cl.CL_buffer_region /* Create device and context */ device = utils.Create_device() context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err) if err < 0 { println("Couldn't create a context") return } /* Create a buffer to hold 100 floating-point values */ main_buffer = cl.CLCreateBuffer(context, cl.CL_MEM_READ_ONLY| cl.CL_MEM_COPY_HOST_PTR, cl.CL_size_t(unsafe.Sizeof(main_data)), unsafe.Pointer(&main_data[0]), &err) if err < 0 { println("Couldn't create a buffer") return } /* Create a sub-buffer containing values 30-49 */ region.Origin = 30 * cl.CL_size_t(unsafe.Sizeof(main_data[0])) region.Size = 20 * cl.CL_size_t(unsafe.Sizeof(main_data[0])) fmt.Printf("origin=%d, size=%d\n", region.Origin, region.Size) sub_buffer = cl.CLCreateSubBuffer(main_buffer, cl.CL_MEM_READ_ONLY| cl.CL_MEM_COPY_HOST_PTR, cl.CL_BUFFER_CREATE_TYPE_REGION, unsafe.Pointer(®ion), &err) if err < 0 { fmt.Printf("Couldn't create a sub-buffer, errcode=%d\n", err) return } /* Obtain size information about the buffers */ cl.CLGetMemObjectInfo(main_buffer, cl.CL_MEM_SIZE, cl.CL_size_t(unsafe.Sizeof(buffer_size)), &main_buffer_size, nil) cl.CLGetMemObjectInfo(sub_buffer, cl.CL_MEM_SIZE, cl.CL_size_t(unsafe.Sizeof(buffer_size)), &sub_buffer_size, nil) fmt.Printf("Main buffer size: %v\n", main_buffer_size.(cl.CL_size_t)) fmt.Printf("Sub-buffer size: %v\n", sub_buffer_size.(cl.CL_size_t)) /* Obtain the host pointers */ cl.CLGetMemObjectInfo(main_buffer, cl.CL_MEM_HOST_PTR, cl.CL_size_t(unsafe.Sizeof(buffer_mem)), &main_buffer_mem, nil) cl.CLGetMemObjectInfo(sub_buffer, cl.CL_MEM_HOST_PTR, cl.CL_size_t(unsafe.Sizeof(buffer_mem)), &sub_buffer_mem, nil) fmt.Printf("Main buffer memory address: %v\n", main_buffer_mem.(cl.CL_ulong)) fmt.Printf("Sub-buffer memory address: %v\n", sub_buffer_mem.(cl.CL_ulong)) /* Print the address of the main data */ fmt.Printf("Main array address: %v\n", main_data) /* Deallocate resources */ cl.CLReleaseMemObject(main_buffer) cl.CLReleaseMemObject(sub_buffer) cl.CLReleaseContext(context) }
func main() { /* OpenCL data structures */ var device []cl.CL_device_id var context cl.CL_context var queue cl.CL_command_queue var program *cl.CL_program var kernel cl.CL_kernel var err cl.CL_int /* Data and events */ var num_ints cl.CL_int var num_items [1]cl.CL_size_t var data [NUM_INTS]cl.CL_int var data_buffer cl.CL_mem var prof_event cl.CL_event var total_time cl.CL_ulong var time_start, time_end interface{} /* Initialize data */ for i := 0; i < NUM_INTS; i++ { data[i] = cl.CL_int(i) } /* Set number of data points and work-items */ num_ints = NUM_INTS num_items[0] = NUM_ITEMS /* Create a device and context */ device = utils.Create_device() context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err) if err < 0 { println("Couldn't create a context") return } /* Build the program and create a kernel */ program = utils.Build_program(context, device[:], PROGRAM_FILE, nil) kernel = cl.CLCreateKernel(*program, KERNEL_FUNC, &err) if err < 0 { println("Couldn't create a kernel") return } /* Create a buffer to hold data */ data_buffer = cl.CLCreateBuffer(context, cl.CL_MEM_READ_WRITE|cl.CL_MEM_COPY_HOST_PTR, cl.CL_size_t(unsafe.Sizeof(data[0]))*NUM_INTS, unsafe.Pointer(&data[0]), &err) if err < 0 { println("Couldn't create a buffer") return } /* Create kernel argument */ err = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(data_buffer)), unsafe.Pointer(&data_buffer)) if err < 0 { println("Couldn't set a kernel argument") return } cl.CLSetKernelArg(kernel, 1, cl.CL_size_t(unsafe.Sizeof(num_ints)), unsafe.Pointer(&num_ints)) /* Create a command queue */ queue = cl.CLCreateCommandQueue(context, device[0], cl.CL_QUEUE_PROFILING_ENABLE, &err) if err < 0 { println("Couldn't create a command queue") return } total_time = 0.0 for i := 0; i < NUM_ITERATIONS; i++ { /* Enqueue kernel */ cl.CLEnqueueNDRangeKernel(queue, kernel, 1, nil, num_items[:], nil, 0, nil, &prof_event) if err < 0 { println("Couldn't enqueue the kernel") return } /* Finish processing the queue and get profiling information */ cl.CLFinish(queue) cl.CLGetEventProfilingInfo(prof_event, cl.CL_PROFILING_COMMAND_START, cl.CL_size_t(unsafe.Sizeof(total_time)), &time_start, nil) cl.CLGetEventProfilingInfo(prof_event, cl.CL_PROFILING_COMMAND_END, cl.CL_size_t(unsafe.Sizeof(total_time)), &time_end, nil) total_time += time_end.(cl.CL_ulong) - time_start.(cl.CL_ulong) } fmt.Printf("Average time = %v\n", total_time/NUM_ITERATIONS) /* Deallocate resources */ cl.CLReleaseEvent(prof_event) cl.CLReleaseKernel(kernel) cl.CLReleaseMemObject(data_buffer) cl.CLReleaseCommandQueue(queue) cl.CLReleaseProgram(*program) cl.CLReleaseContext(context) }
func main() { /* OpenCL data structures */ var device []cl.CL_device_id var context cl.CL_context var queue cl.CL_command_queue var program *cl.CL_program var kernel cl.CL_kernel var err cl.CL_int var offset, global_size, local_size [1]cl.CL_size_t /* Data and events */ var data [2]cl.CL_int var data_buffer cl.CL_mem /* Create a device and context */ device = utils.Create_device() context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err) if err < 0 { println("Couldn't create a context") return } /* Build the program and create a kernel */ program = utils.Build_program(context, device[:], PROGRAM_FILE, nil) kernel = cl.CLCreateKernel(*program, KERNEL_FUNC, &err) if err < 0 { println("Couldn't create a kernel") return } /* Create a buffer to hold data */ data_buffer = cl.CLCreateBuffer(context, cl.CL_MEM_WRITE_ONLY, cl.CL_size_t(unsafe.Sizeof(data[0]))*2, nil, &err) if err < 0 { println("Couldn't create a buffer") return } /* Create kernel argument */ err = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(data_buffer)), unsafe.Pointer(&data_buffer)) if err < 0 { println("Couldn't set a kernel argument") return } /* Create a command queue */ queue = cl.CLCreateCommandQueue(context, device[0], 0, &err) if err < 0 { println("Couldn't create a command queue") return } /* Enqueue kernel */ offset[0] = 0 global_size[0] = 8 local_size[0] = 4 err = cl.CLEnqueueNDRangeKernel(queue, kernel, 1, offset[:], global_size[:], local_size[:], 0, nil, nil) if err < 0 { println("Couldn't enqueue the kernel") return } /* Read the buffer */ err = cl.CLEnqueueReadBuffer(queue, data_buffer, cl.CL_TRUE, 0, cl.CL_size_t(unsafe.Sizeof(data[0]))*2, unsafe.Pointer(&data[0]), 0, nil, nil) if err < 0 { println("Couldn't read the buffer") return } fmt.Printf("Increment: %d\n", data[0]) fmt.Printf("Atomic increment: %d\n", data[1]) /* Deallocate resources */ cl.CLReleaseMemObject(data_buffer) cl.CLReleaseKernel(kernel) cl.CLReleaseCommandQueue(queue) cl.CLReleaseProgram(*program) cl.CLReleaseContext(context) }
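The kernel source referenced by PROGRAM_FILE is not reproduced here. The Go constant below is only a guess at what such a kernel might look like, based on the two values the host prints (a plain increment and an atomic_inc result); the actual file and kernel name may differ.

// atomicSource is a hypothetical sketch of a kernel matching this listing:
// every work-item bumps both counters, but only the atomic_inc result is
// guaranteed to reflect every work-item in the group.
const atomicSource = `
__kernel void increment_test(__global int* out) {
   __local int a, b;
   a = 0;
   b = 0;
   barrier(CLK_LOCAL_MEM_FENCE);
   a++;              /* plain increment: work-items race on the same value */
   atomic_inc(&b);   /* atomic increment: every work-item is counted */
   barrier(CLK_LOCAL_MEM_FENCE);
   out[0] = a;
   out[1] = b;
}`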
func TestMatvec(t *testing.T) { /* Host/device data structures */ var platform [1]cl.CL_platform_id var device [1]cl.CL_device_id var context cl.CL_context var queue cl.CL_command_queue var i, err cl.CL_int /* Program/kernel data structures */ var program cl.CL_program var program_buffer [1][]byte var program_log interface{} var program_size [1]cl.CL_size_t var log_size cl.CL_size_t var kernel cl.CL_kernel /* Data and buffers */ var mat [16]float32 var vec, result [4]float32 var correct = [4]float32{0.0, 0.0, 0.0, 0.0} var mat_buff, vec_buff, res_buff cl.CL_mem /* Initialize data to be processed by the kernel */ for i = 0; i < 16; i++ { mat[i] = float32(i) * 2.0 } for i = 0; i < 4; i++ { vec[i] = float32(i) * 3.0 correct[0] += mat[i] * vec[i] correct[1] += mat[i+4] * vec[i] correct[2] += mat[i+8] * vec[i] correct[3] += mat[i+12] * vec[i] } /* Identify a platform */ err = cl.CLGetPlatformIDs(1, platform[:], nil) if err < 0 { t.Errorf("Couldn't find any platforms") } /* Access a device */ err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_GPU, 1, device[:], nil) if err < 0 { err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_CPU, 1, device[:], nil) if err < 0 { t.Errorf("Couldn't find any devices") } } /* Create the context */ context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err) if err < 0 { t.Errorf("Couldn't create a context") } /* Read program file and place content into buffer */ program_handle, err1 := os.Open("matvec.cl") if err1 != nil { t.Errorf("Couldn't find the program file") } defer program_handle.Close() fi, err2 := program_handle.Stat() if err2 != nil { t.Errorf("Couldn't find the program stat") } program_size[0] = cl.CL_size_t(fi.Size()) program_buffer[0] = make([]byte, program_size[0]) read_size, err3 := program_handle.Read(program_buffer[0]) if err3 != nil || cl.CL_size_t(read_size) != program_size[0] { t.Errorf("read file error or file size wrong") } /* Create a program containing all program content */ program = cl.CLCreateProgramWithSource(context, 1, program_buffer[:], program_size[:], &err) if err < 0 { t.Errorf("Couldn't create the program") } /* Build program */ err = cl.CLBuildProgram(program, 1, device[:], nil, nil, nil) if err < 0 { /* Find size of log and print to std output */ cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG, 0, nil, &log_size) //program_log = (char*) malloc(log_size+1); //program_log[log_size] = '\0'; cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG, log_size, &program_log, nil) t.Errorf("%s\n", program_log) //free(program_log); } /* Create kernel for the mat_vec_mult function */ kernel = cl.CLCreateKernel(program, []byte("matvec_mult"), &err) if err < 0 { t.Errorf("Couldn't create the kernel") return } /* Create CL buffers to hold input and output data */ mat_buff = cl.CLCreateBuffer(context, cl.CL_MEM_READ_ONLY| cl.CL_MEM_COPY_HOST_PTR, cl.CL_size_t(unsafe.Sizeof(mat)), unsafe.Pointer(&mat[0]), &err) if err < 0 { t.Errorf("Couldn't create a buffer object") return } vec_buff = cl.CLCreateBuffer(context, cl.CL_MEM_READ_ONLY| cl.CL_MEM_COPY_HOST_PTR, cl.CL_size_t(unsafe.Sizeof(vec)), unsafe.Pointer(&vec[0]), nil) res_buff = cl.CLCreateBuffer(context, cl.CL_MEM_WRITE_ONLY, cl.CL_size_t(unsafe.Sizeof(result)), nil, nil) /* Create kernel arguments from the CL buffers */ err = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(mat_buff)), unsafe.Pointer(&mat_buff)) if err < 0 { t.Errorf("Couldn't set the kernel argument") return } cl.CLSetKernelArg(kernel, 1, 
cl.CL_size_t(unsafe.Sizeof(vec_buff)), unsafe.Pointer(&vec_buff)) cl.CLSetKernelArg(kernel, 2, cl.CL_size_t(unsafe.Sizeof(res_buff)), unsafe.Pointer(&res_buff)) /* Create a CL command queue for the device*/ queue = cl.CLCreateCommandQueue(context, device[0], 0, &err) if err < 0 { t.Errorf("Couldn't create the command queue, errcode=%d\n", err) return } /* Enqueue the command queue to the device */ var work_units_per_kernel = [1]cl.CL_size_t{4} /* 4 work-units per kernel */ err = cl.CLEnqueueNDRangeKernel(queue, kernel, 1, nil, work_units_per_kernel[:], nil, 0, nil, nil) if err < 0 { t.Errorf("Couldn't enqueue the kernel execution command, errcode=%d\n", err) return } /* Read the result */ err = cl.CLEnqueueReadBuffer(queue, res_buff, cl.CL_TRUE, 0, cl.CL_size_t(unsafe.Sizeof(result)), unsafe.Pointer(&result[0]), 0, nil, nil) if err < 0 { t.Errorf("Couldn't enqueue the read buffer command") return } /* Test the result */ if (result[0] == correct[0]) && (result[1] == correct[1]) && (result[2] == correct[2]) && (result[3] == correct[3]) { t.Logf("Matrix-vector multiplication successful.") } else { t.Errorf("Matrix-vector multiplication unsuccessful.") } /* Deallocate resources */ cl.CLReleaseMemObject(mat_buff) cl.CLReleaseMemObject(vec_buff) cl.CLReleaseMemObject(res_buff) cl.CLReleaseKernel(kernel) cl.CLReleaseCommandQueue(queue) cl.CLReleaseProgram(program) cl.CLReleaseContext(context) }
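matvec.cl itself is not shown in this test. The constant below sketches a kernel that fits the host-side setup (a 16-float matrix, a 4-float vector, four work-items, and a kernel named matvec_mult): each work-item computes one dot product. The real file may be organized differently.

// matvecSource is a hypothetical sketch of matvec.cl, assuming one
// work-item per output row: result[i] = dot(row i of matrix, vec).
const matvecSource = `
__kernel void matvec_mult(__global float4* matrix,
                          __global float4* vector,
                          __global float* result) {
   int i = get_global_id(0);
   result[i] = dot(matrix[i], vector[0]);
}`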
func main() { // Use this to check the output of each API call var status cl.CL_int //----------------------------------------------------- // STEP 1: Discover and initialize the platforms //----------------------------------------------------- var numPlatforms cl.CL_uint // Use clGetPlatformIDs() to retrieve the number of // platforms status = cl.CLGetPlatformIDs(0, nil, &numPlatforms) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetPlatformIDs") // Allocate enough space for each platform platforms := make([]cl.CL_platform_id, numPlatforms) // Fill in platforms with clGetPlatformIDs() status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetPlatformIDs") //----------------------------------------------------- // STEP 2: Discover and initialize the GPU devices //----------------------------------------------------- var numDevices cl.CL_uint // Use clGetDeviceIDs() to retrieve the number of // devices present status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU, 0, nil, &numDevices) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs") // Allocate enough space for each device devices := make([]cl.CL_device_id, numDevices) // Fill in devices with clGetDeviceIDs() status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU, numDevices, devices, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs") var caps cl.CL_device_svm_capabilities var caps_value interface{} status = cl.CLGetDeviceInfo( devices[0], cl.CL_DEVICE_SVM_CAPABILITIES, cl.CL_size_t(unsafe.Sizeof(caps)), &caps_value, nil) caps = caps_value.(cl.CL_device_svm_capabilities) // Fine-grained buffer SVM is an optional capability, even on OpenCL 2.0 devices, // so verify that CL_DEVICE_SVM_FINE_GRAIN_BUFFER is reported before going any further: if !(status == cl.CL_SUCCESS && (caps&cl.CL_DEVICE_SVM_FINE_GRAIN_BUFFER) != 0) { fmt.Printf("Cannot detect fine-grained buffer SVM capabilities on the device. The device seemingly doesn't support fine-grained buffer SVM. 
caps=%x\n", caps) println("") return } //----------------------------------------------------- // STEP 3: Create a context //----------------------------------------------------- // Create a context using clCreateContext() and // associate it with the devices context := cl.CLCreateContext(nil, numDevices, devices, nil, nil, &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateContext") defer cl.CLReleaseContext(context) //----------------------------------------------------- // STEP 4: Create a command queue //----------------------------------------------------- // Create a command queue using clCreateCommandQueueWithProperties(), // and associate it with the device you want to execute queue := cl.CLCreateCommandQueueWithProperties(context, devices[0], nil, &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateCommandQueueWithProperties") defer cl.CLReleaseCommandQueue(queue) //----------------------------------------------------- // STEP 5: Create and compile the program //----------------------------------------------------- programSource, programeSize := utils.Load_programsource("svmfg.cl") // Create a program using clCreateProgramWithSource() program := cl.CLCreateProgramWithSource(context, 1, programSource[:], programeSize[:], &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateProgramWithSource") defer cl.CLReleaseProgram(program) // Build (compile) the program for the devices with // clBuildProgram() options := "-cl-std=CL2.0" status = cl.CLBuildProgram(program, numDevices, devices, []byte(options), nil, nil) if status != cl.CL_SUCCESS { var log interface{} var log_size cl.CL_size_t /* Find size of log and print to std output */ cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG, 0, nil, &log_size) cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG, log_size, &log, nil) fmt.Printf("%s\n", log) return } //utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLBuildProgram") //----------------------------------------------------- // STEP 7: Create the kernel //----------------------------------------------------- // Use clCreateKernel() to create a kernel kernel := cl.CLCreateKernel(program, []byte("svmbasic"), &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel") defer cl.CLReleaseKernel(kernel) // Then call the main sample routine - resource allocations, OpenCL kernel // execution, and so on. svmbasic(1024*1024, context, queue, kernel) // All resource deallocations happen in defer. }
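The check above only looks at the fine-grained buffer bit. When a device fails it, it can be useful to see which SVM capabilities it does report. The sketch below is a hypothetical helper built on the same CLGetDeviceInfo convention; it assumes the binding exposes the standard OpenCL 2.0 capability constants (only CL_DEVICE_SVM_FINE_GRAIN_BUFFER appears in the listing above, the other names are assumptions).

// reportSVMCaps is a hypothetical helper that prints the SVM capability
// bits a device advertises, using the interface{}-based CLGetDeviceInfo
// convention shown in the listing above.
func reportSVMCaps(device cl.CL_device_id) {
	var caps cl.CL_device_svm_capabilities
	var value interface{}
	status := cl.CLGetDeviceInfo(device, cl.CL_DEVICE_SVM_CAPABILITIES,
		cl.CL_size_t(unsafe.Sizeof(caps)), &value, nil)
	if status != cl.CL_SUCCESS {
		fmt.Println("CL_DEVICE_SVM_CAPABILITIES query failed; likely not an OpenCL 2.0 device")
		return
	}
	caps = value.(cl.CL_device_svm_capabilities)
	fmt.Printf("coarse-grained buffer SVM: %v\n", caps&cl.CL_DEVICE_SVM_COARSE_GRAIN_BUFFER != 0)
	fmt.Printf("fine-grained buffer SVM:   %v\n", caps&cl.CL_DEVICE_SVM_FINE_GRAIN_BUFFER != 0)
	fmt.Printf("fine-grained system SVM:   %v\n", caps&cl.CL_DEVICE_SVM_FINE_GRAIN_SYSTEM != 0)
	fmt.Printf("SVM atomics:               %v\n", caps&cl.CL_DEVICE_SVM_ATOMICS != 0)
}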
func svmBinaryTreeCPUReference(commandQueue cl.CL_command_queue, svmRoot *node, svmTreeBuf unsafe.Pointer, svmSearchBuf unsafe.Pointer) { var status cl.CL_int /* reserve svm buffers for cpu usage */ status = cl.CLEnqueueSVMMap(commandQueue, cl.CL_TRUE, //blocking call cl.CL_MAP_READ, svmTreeBuf, cl.CL_size_t(NUMBER_OF_NODES*unsafe.Sizeof(sampleNode)), 0, nil, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMMap(svmTreeBuf)") status = cl.CLEnqueueSVMMap(commandQueue, cl.CL_TRUE, //blocking call cl.CL_MAP_WRITE, svmSearchBuf, cl.CL_size_t(NUMBER_OF_SEARCH_KEY*unsafe.Sizeof(sampleKey)), 0, nil, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMMap(svmSearchBuf)") for i := 0; i < NUMBER_OF_SEARCH_KEY; i++ { /* search tree */ searchNode := svmRoot currKey := (*searchKey)(unsafe.Pointer(uintptr(svmSearchBuf) + uintptr(i)*unsafe.Sizeof(sampleKey))) for nil != searchNode { if currKey.key == searchNode.value { /* rejoice on finding key */ currKey.nativeNode = searchNode searchNode = nil } else if currKey.key < searchNode.value { /* move left */ searchNode = searchNode.left } else { /* move right */ searchNode = searchNode.right } } } status = cl.CLEnqueueSVMUnmap(commandQueue, svmSearchBuf, 0, nil, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMUnmap(svmSearchBuf)") status = cl.CLEnqueueSVMUnmap(commandQueue, svmTreeBuf, 0, nil, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMUnmap(svmTreeBuf)") }
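The node and searchKey types used here are defined elsewhere in the sample. The sketch below is a hypothetical reconstruction based on the fields this function touches; the real definitions must match the layout the OpenCL kernel expects, since host and device walk the same SVM allocations via unsafe.Sizeof-based offsets.

// Hypothetical sketch of the tree node and search-key layouts implied by
// the CPU reference search above. The actual definitions live elsewhere in
// the sample and may carry additional fields.
type node struct {
	value cl.CL_int // key stored in this tree node
	left  *node     // subtree with smaller keys
	right *node     // subtree with larger keys
}

type searchKey struct {
	key        cl.CL_int // value to look up in the tree
	oclNode    *node     // node found by the device-side search (assumed field)
	nativeNode *node     // node found by this CPU reference search
}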
func main() { // Use this to check the output of each API call var status cl.CL_int //----------------------------------------------------- // STEP 1: Discover and initialize the platforms //----------------------------------------------------- var numPlatforms cl.CL_uint var platforms []cl.CL_platform_id // Use clGetPlatformIDs() to retrieve the number of // platforms status = cl.CLGetPlatformIDs(0, nil, &numPlatforms) // Allocate enough space for each platform platforms = make([]cl.CL_platform_id, numPlatforms) // Fill in platforms with clGetPlatformIDs() status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetPlatformIDs") //----------------------------------------------------- // STEP 2: Discover and initialize the GPU devices //----------------------------------------------------- var numDevices cl.CL_uint var devices []cl.CL_device_id // Use clGetDeviceIDs() to retrieve the number of // devices present status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU, 0, nil, &numDevices) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs") // Allocate enough space for each device devices = make([]cl.CL_device_id, numDevices) // Fill in devices with clGetDeviceIDs() status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU, numDevices, devices, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs") //----------------------------------------------------- // STEP 3: Create a context //----------------------------------------------------- var context cl.CL_context // Create a context using clCreateContext() and // associate it with the devices context = cl.CLCreateContext(nil, numDevices, devices, nil, nil, &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateContext") defer cl.CLReleaseContext(context) //----------------------------------------------------- // STEP 4: Create a command queue //----------------------------------------------------- var commandQueue [MAX_COMMAND_QUEUE]cl.CL_command_queue // Create a command queue using clCreateCommandQueueWithProperties(), // and associate it with the device you want to execute for i := 0; i < MAX_COMMAND_QUEUE; i++ { commandQueue[i] = cl.CLCreateCommandQueueWithProperties(context, devices[0], nil, &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateCommandQueueWithProperties") defer cl.CLReleaseCommandQueue(commandQueue[i]) } //----------------------------------------------------- // STEP 5: Create device buffers //----------------------------------------------------- producerGroupSize := cl.CL_size_t(PRODUCER_GROUP_SIZE) producerGlobalSize := cl.CL_size_t(PRODUCER_GLOBAL_SIZE) consumerGroupSize := cl.CL_size_t(CONSUMER_GROUP_SIZE) consumerGlobalSize := cl.CL_size_t(CONSUMER_GLOBAL_SIZE) var samplePipePkt [2]cl.CL_float szPipe := cl.CL_uint(PIPE_SIZE) szPipePkt := cl.CL_uint(unsafe.Sizeof(samplePipePkt)) if szPipe%PRNG_CHANNELS != 0 { szPipe = (szPipe/PRNG_CHANNELS)*PRNG_CHANNELS + PRNG_CHANNELS } consumerGlobalSize = cl.CL_size_t(szPipe) pipePktPerThread := cl.CL_int(szPipe) / PRNG_CHANNELS seed := cl.CL_int(SEED) rngType := cl.CL_int(RV_GAUSSIAN) var histMin cl.CL_float var histMax cl.CL_float if rngType == cl.CL_int(RV_UNIFORM) { histMin = 0.0 histMax = 1.0 } else { histMin = -10.0 histMax = 10.0 } localDevHist := make([]cl.CL_int, MAX_HIST_BINS) cpuHist := make([]cl.CL_int, MAX_HIST_BINS) //Create and initialize memory objects rngPipe := cl.CLCreatePipe(context, cl.CL_MEM_READ_WRITE, szPipePkt, szPipe, nil, &status) utils.CHECK_STATUS(status, 
cl.CL_SUCCESS, "clCreatePipe") devHist := cl.CLCreateBuffer(context, cl.CL_MEM_READ_WRITE|cl.CL_MEM_COPY_HOST_PTR, MAX_HIST_BINS*cl.CL_size_t(unsafe.Sizeof(localDevHist[0])), unsafe.Pointer(&localDevHist[0]), &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clCreateBuffer") //----------------------------------------------------- // STEP 6: Create and compile the program //----------------------------------------------------- programSource, programeSize := utils.Load_programsource("pipe.cl") // Create a program using clCreateProgramWithSource() program := cl.CLCreateProgramWithSource(context, 1, programSource[:], programeSize[:], &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateProgramWithSource") defer cl.CLReleaseProgram(program) // Build (compile) the program for the devices with // clBuildProgram() options := "-cl-std=CL2.0" status = cl.CLBuildProgram(program, numDevices, devices, []byte(options), nil, nil) if status != cl.CL_SUCCESS { var program_log interface{} var log_size cl.CL_size_t /* Find size of log and print to std output */ cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG, 0, nil, &log_size) cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG, log_size, &program_log, nil) fmt.Printf("%s\n", program_log) return } //utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLBuildProgram") //----------------------------------------------------- // STEP 7: Create the kernel //----------------------------------------------------- // Use clCreateKernel() to create a kernel produceKernel := cl.CLCreateKernel(program, []byte("pipe_producer"), &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel") defer cl.CLReleaseKernel(produceKernel) consumeKernel := cl.CLCreateKernel(program, []byte("pipe_consumer"), &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel") defer cl.CLReleaseKernel(consumeKernel) //----------------------------------------------------- // STEP 8: Set the kernel arguments //----------------------------------------------------- // Associate the input and output buffers with the // kernel // using clSetKernelArg() // Set appropriate arguments to the kernel status = cl.CLSetKernelArg(produceKernel, 0, cl.CL_size_t(unsafe.Sizeof(rngPipe)), unsafe.Pointer(&rngPipe)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(rngPipe)") status = cl.CLSetKernelArg(produceKernel, 1, cl.CL_size_t(unsafe.Sizeof(pipePktPerThread)), unsafe.Pointer(&pipePktPerThread)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(pipePktPerThread)") status = cl.CLSetKernelArg(produceKernel, 2, cl.CL_size_t(unsafe.Sizeof(seed)), unsafe.Pointer(&seed)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(seed)") status = cl.CLSetKernelArg(produceKernel, 3, cl.CL_size_t(unsafe.Sizeof(rngType)), unsafe.Pointer(&rngType)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(rngType)") //----------------------------------------------------- // STEP 9: Configure the work-item structure //----------------------------------------------------- // Define an index space (global work size) of work // items for // execution. A workgroup size (local work size) is not // required, // but can be used. // Enqueue both the kernels. 
var globalThreads = []cl.CL_size_t{producerGlobalSize} var localThreads = []cl.CL_size_t{producerGroupSize} //----------------------------------------------------- // STEP 10: Enqueue the kernel for execution //----------------------------------------------------- // Execute the kernel by using // clEnqueueNDRangeKernel(). // 'globalWorkSize' is the 1D dimension of the // work-items var produceEvt [1]cl.CL_event status = cl.CLEnqueueNDRangeKernel(commandQueue[0], produceKernel, 1, nil, globalThreads, localThreads, 0, nil, &produceEvt[0]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueNDRangeKernel") /* launch consumer kernel only after producer has finished. This is done to avoid concurrent kernels execution as the memory consistency of pipe is guaranteed only across synchronization points. */ status = cl.CLWaitForEvents(1, produceEvt[:]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clWaitForEvents(produceEvt)") //----------------------------------------------------- // STEP 8: Set the kernel arguments //----------------------------------------------------- // Associate the input and output buffers with the // kernel // using clSetKernelArg() // Set appropriate arguments to the kernel status = cl.CLSetKernelArg(consumeKernel, 0, cl.CL_size_t(unsafe.Sizeof(rngPipe)), unsafe.Pointer(&rngPipe)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(rngPipe)") status = cl.CLSetKernelArg(consumeKernel, 1, cl.CL_size_t(unsafe.Sizeof(devHist)), unsafe.Pointer(&devHist)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(devHist)") status = cl.CLSetKernelArg(consumeKernel, 2, cl.CL_size_t(unsafe.Sizeof(histMin)), unsafe.Pointer(&histMin)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(histMin)") status = cl.CLSetKernelArg(consumeKernel, 3, cl.CL_size_t(unsafe.Sizeof(histMax)), unsafe.Pointer(&histMax)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(histMax)") //----------------------------------------------------- // STEP 9: Configure the work-item structure //----------------------------------------------------- // Define an index space (global work size) of work // items for // execution. A workgroup size (local work size) is not // required, // but can be used. globalThreads[0] = consumerGlobalSize localThreads[0] = consumerGroupSize //----------------------------------------------------- // STEP 10: Enqueue the kernel for execution //----------------------------------------------------- // Execute the kernel by using // clEnqueueNDRangeKernel(). 
// 'globalWorkSize' is the 1D dimension of the // work-items var consumeEvt [1]cl.CL_event status = cl.CLEnqueueNDRangeKernel( commandQueue[1], consumeKernel, 1, nil, globalThreads, localThreads, 0, nil, &consumeEvt[0]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueNDRangeKernel") status = cl.CLFlush(commandQueue[0]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFlush(0)") status = cl.CLFlush(commandQueue[1]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFlush(1)") //wait for kernels to finish status = cl.CLFinish(commandQueue[0]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFinish(0)") status = cl.CLFinish(commandQueue[1]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFinish(1)") //----------------------------------------------------- // STEP 11: Read the output buffer back to the host //----------------------------------------------------- // Use clEnqueueReadBuffer() to read the OpenCL output // buffer (bufferC) // to the host output array (C) //copy the data back to host buffer var readEvt cl.CL_event status = cl.CLEnqueueReadBuffer(commandQueue[1], devHist, cl.CL_TRUE, 0, (MAX_HIST_BINS)*cl.CL_size_t(unsafe.Sizeof(localDevHist[0])), unsafe.Pointer(&localDevHist[0]), 0, nil, &readEvt) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueReadBuffer") //----------------------------------------------------- // STEP 12: Verify the results //----------------------------------------------------- //Find the tolerance limit fTol := (float32)(CONSUMER_GLOBAL_SIZE) * (float32)(COMP_TOL) / (float32)(100.0) iTol := (int)(fTol) if iTol == 0 { iTol = 1 } //CPU side histogram computation CPUReference(seed, pipePktPerThread, rngType, cpuHist, histMax, histMin) //Compare for bin := 0; bin < MAX_HIST_BINS; bin++ { diff := int(localDevHist[bin] - cpuHist[bin]) if diff < 0 { diff = -diff } if diff > iTol { println("Failed!") return } } println("Passed!") }
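CPUReference and the pipe kernels live in separate files, so the binning rule they share is not visible here. The function below is a minimal sketch under the assumption that both sides clamp a sample in [histMin, histMax) into one of MAX_HIST_BINS equal-width buckets; histBin is an illustrative name, not part of the sample.

// histBin is a hypothetical sketch of the binning rule the consumer kernel
// and CPUReference would have to agree on: map a sample to one of `bins`
// equal-width buckets, clamping out-of-range values to the edge bins.
func histBin(v, histMin, histMax cl.CL_float, bins int) int {
	binWidth := (histMax - histMin) / cl.CL_float(bins)
	b := int((v - histMin) / binWidth)
	if b < 0 {
		b = 0
	}
	if b >= bins {
		b = bins - 1
	}
	return b
}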
func main() { /* OpenCL data structures */ var device []cl.CL_device_id var context cl.CL_context var queue cl.CL_command_queue var program *cl.CL_program var kernel cl.CL_kernel var err cl.CL_int /* Data and buffers */ var full_matrix, zero_matrix [80]float32 var sizeoffloat32 = cl.CL_size_t(unsafe.Sizeof(full_matrix[0])) var buffer_origin = [3]cl.CL_size_t{5 * sizeoffloat32, 3, 0} var host_origin = [3]cl.CL_size_t{1 * sizeoffloat32, 1, 0} var region = [3]cl.CL_size_t{4 * sizeoffloat32, 4, 1} var matrix_buffer cl.CL_mem /* Initialize data */ for i := 0; i < 80; i++ { full_matrix[i] = float32(i) * 1.0 zero_matrix[i] = 0.0 } /* Create a device and context */ device = utils.Create_device() context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err) if err < 0 { println("Couldn't create a context") return } /* Build the program and create the kernel */ program = utils.Build_program(context, device[:], PROGRAM_FILE, nil) if program == nil { println("Couldn't build program") return } kernel = cl.CLCreateKernel(*program, []byte(KERNEL_FUNC), &err) if err < 0 { println("Couldn't create a kernel") return } /* Create a buffer to hold 80 floats */ matrix_buffer = cl.CLCreateBuffer(context, cl.CL_MEM_READ_WRITE| cl.CL_MEM_COPY_HOST_PTR, cl.CL_size_t(unsafe.Sizeof(full_matrix)), unsafe.Pointer(&full_matrix[0]), &err) if err < 0 { println("Couldn't create a buffer object") return } /* Set buffer as argument to the kernel */ err = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(matrix_buffer)), unsafe.Pointer(&matrix_buffer)) if err < 0 { println("Couldn't set the buffer as the kernel argument") return } /* Create a command queue */ queue = cl.CLCreateCommandQueue(context, device[0], 0, &err) if err < 0 { println("Couldn't create a command queue") return } /* Enqueue kernel */ err = cl.CLEnqueueTask(queue, kernel, 0, nil, nil) if err < 0 { println("Couldn't enqueue the kernel") return } /* Enqueue command to write to buffer */ err = cl.CLEnqueueWriteBuffer(queue, matrix_buffer, cl.CL_TRUE, 0, cl.CL_size_t(unsafe.Sizeof(full_matrix)), unsafe.Pointer(&full_matrix[0]), 0, nil, nil) if err < 0 { println("Couldn't write to the buffer object") return } /* Enqueue command to read rectangle of data */ err = cl.CLEnqueueReadBufferRect(queue, matrix_buffer, cl.CL_TRUE, buffer_origin, host_origin, region, 10*sizeoffloat32, 0, 10*sizeoffloat32, 0, unsafe.Pointer(&zero_matrix[0]), 0, nil, nil) if err < 0 { println("Couldn't read the rectangle from the buffer object") return } /* Display updated buffer */ for i := 0; i < 8; i++ { for j := 0; j < 10; j++ { fmt.Printf("%6.1f", zero_matrix[j+i*10]) } fmt.Printf("\n") } /* Deallocate resources */ cl.CLReleaseMemObject(matrix_buffer) cl.CLReleaseKernel(kernel) cl.CLReleaseCommandQueue(queue) cl.CLReleaseProgram(*program) cl.CLReleaseContext(context) }
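The rectangle arguments above mix units: the x components of the origins and of region are byte offsets, while the y components count rows, and both pitches are ten floats. The helper below is a hypothetical sketch that enumerates which flat indices such a CLEnqueueReadBufferRect call copies, assuming the same row pitch on the buffer and host sides as in this listing.

// printRectMapping is a hypothetical helper that lists the flat float32
// indices a buffer-rect read copies, given byte-based x origins/width,
// row-based y origins/height, and a shared row pitch in bytes.
func printRectMapping(bufferOrigin, hostOrigin, region [3]cl.CL_size_t, rowPitch cl.CL_size_t) {
	elem := cl.CL_size_t(unsafe.Sizeof(float32(0)))
	for row := cl.CL_size_t(0); row < region[1]; row++ {
		for col := cl.CL_size_t(0); col < region[0]/elem; col++ {
			src := (bufferOrigin[1]+row)*(rowPitch/elem) + bufferOrigin[0]/elem + col
			dst := (hostOrigin[1]+row)*(rowPitch/elem) + hostOrigin[0]/elem + col
			fmt.Printf("buffer[%d] -> host[%d]\n", src, dst)
		}
	}
}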
func main() { /* Host/device data structures */ var device []cl.CL_device_id var context cl.CL_context var queue cl.CL_command_queue var program *cl.CL_program var kernel cl.CL_kernel var err cl.CL_int /* Data and buffers */ var test [16]byte var test_buffer cl.CL_mem /* Create a context */ device = utils.Create_device() context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err) if err < 0 { println("Couldn't create a context") return } /* Build the program and create a kernel */ program = utils.Build_program(context, device[:], PROGRAM_FILE, nil) kernel = cl.CLCreateKernel(*program, KERNEL_FUNC, &err) if err < 0 { println("Couldn't create a kernel") return } /* Create a write-only buffer to hold the output data */ test_buffer = cl.CLCreateBuffer(context, cl.CL_MEM_WRITE_ONLY, cl.CL_size_t(unsafe.Sizeof(test)), nil, &err) if err < 0 { println("Couldn't create a buffer") return } /* Create kernel argument */ err = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(test_buffer)), unsafe.Pointer(&test_buffer)) if err < 0 { println("Couldn't set a kernel argument") return } /* Create a command queue */ queue = cl.CLCreateCommandQueue(context, device[0], 0, &err) if err < 0 { println("Couldn't create a command queue") return } /* Enqueue kernel */ err = cl.CLEnqueueTask(queue, kernel, 0, nil, nil) if err < 0 { println("Couldn't enqueue the kernel") return } /* Read and print the result */ err = cl.CLEnqueueReadBuffer(queue, test_buffer, cl.CL_TRUE, 0, cl.CL_size_t(unsafe.Sizeof(test)), unsafe.Pointer(&test), 0, nil, nil) if err < 0 { println("Couldn't read the buffer") return } for i := 0; i < 15; i++ { fmt.Printf("0x%X, ", test[i]) } fmt.Printf("0x%X\n", test[15]) /* Deallocate resources */ cl.CLReleaseMemObject(test_buffer) cl.CLReleaseKernel(kernel) cl.CLReleaseCommandQueue(queue) cl.CLReleaseProgram(*program) cl.CLReleaseContext(context) }
func main() { /* OpenCL data structures */ var device []cl.CL_device_id var context cl.CL_context var queue cl.CL_command_queue var program *cl.CL_program var kernel cl.CL_kernel var err cl.CL_int /* Data and buffers */ var data_one, data_two, result_array [100]float32 var buffer_one, buffer_two cl.CL_mem var mapped_memory unsafe.Pointer /* Initialize arrays */ for i := 0; i < 100; i++ { data_one[i] = 1.0 * float32(i) data_two[i] = -1.0 * float32(i) result_array[i] = 0.0 } /* Create a device and context */ device = utils.Create_device() context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err) if err < 0 { println("Couldn't create a context") return } /* Build the program and create the kernel */ program = utils.Build_program(context, device[:], PROGRAM_FILE, nil) kernel = cl.CLCreateKernel(*program, []byte(KERNEL_FUNC), &err) if err < 0 { println("Couldn't create a kernel") return } /* Create buffers */ buffer_one = cl.CLCreateBuffer(context, cl.CL_MEM_READ_WRITE| cl.CL_MEM_COPY_HOST_PTR, cl.CL_size_t(unsafe.Sizeof(data_one)), unsafe.Pointer(&data_one[0]), &err) if err < 0 { println("Couldn't create buffer object 1") return } buffer_two = cl.CLCreateBuffer(context, cl.CL_MEM_READ_WRITE| cl.CL_MEM_COPY_HOST_PTR, cl.CL_size_t(unsafe.Sizeof(data_two)), unsafe.Pointer(&data_two), &err) if err < 0 { println("Couldn't create buffer object 2") return } /* Set buffers as arguments to the kernel */ err = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(buffer_one)), unsafe.Pointer(&buffer_one)) err |= cl.CLSetKernelArg(kernel, 1, cl.CL_size_t(unsafe.Sizeof(buffer_two)), unsafe.Pointer(&buffer_two)) if err < 0 { println("Couldn't set the buffer as the kernel argument") return } /* Create a command queue */ queue = cl.CLCreateCommandQueue(context, device[0], 0, &err) if err < 0 { println("Couldn't create a command queue") return } /* Enqueue kernel */ err = cl.CLEnqueueTask(queue, kernel, 0, nil, nil) if err < 0 { println("Couldn't enqueue the kernel") return } /* Enqueue command to copy buffer one to buffer two */ err = cl.CLEnqueueCopyBuffer(queue, buffer_one, buffer_two, 0, 0, cl.CL_size_t(unsafe.Sizeof(data_one)), 0, nil, nil) if err < 0 { println("Couldn't perform the buffer copy") return } /* Enqueue command to map buffer two to host memory */ mapped_memory = cl.CLEnqueueMapBuffer(queue, buffer_two, cl.CL_TRUE, cl.CL_MAP_READ, 0, cl.CL_size_t(unsafe.Sizeof(data_two)), 0, nil, nil, &err) if err < 0 { println("Couldn't map the buffer to host memory") return } /* Transfer memory and unmap the buffer */ C.memcpy(unsafe.Pointer(&result_array[0]), mapped_memory, C.size_t(unsafe.Sizeof(data_two))) err = cl.CLEnqueueUnmapMemObject(queue, buffer_two, mapped_memory, 0, nil, nil) if err < 0 { println("Couldn't unmap the buffer") return } /* Display updated buffer */ for i := 0; i < 10; i++ { for j := 0; j < 10; j++ { fmt.Printf("%6.1f", result_array[j+i*10]) } fmt.Printf("\n") } /* Deallocate resources */ cl.CLReleaseMemObject(buffer_one) cl.CLReleaseMemObject(buffer_two) cl.CLReleaseKernel(kernel) cl.CLReleaseCommandQueue(queue) cl.CLReleaseProgram(*program) cl.CLReleaseContext(context) }
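The copy out of the mapped region above goes through C.memcpy, which requires cgo. On Go 1.17 or newer, a cgo-free alternative is to view the mapped pointer as a slice and use the built-in copy; the helper below is a sketch of that approach (copyMappedFloats is an illustrative name), usable as copyMappedFloats(result_array[:], mapped_memory).

// copyMappedFloats is a hypothetical cgo-free alternative to the C.memcpy
// call above (requires Go 1.17+ for unsafe.Slice): it views the mapped
// region as a []float32 and copies it into dst.
func copyMappedFloats(dst []float32, mapped unsafe.Pointer) {
	src := unsafe.Slice((*float32)(mapped), len(dst))
	copy(dst, src)
}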
func main() { /* OpenCL data structures */ var device []cl.CL_device_id var context cl.CL_context var queue cl.CL_command_queue var program *cl.CL_program var kernel cl.CL_kernel var err cl.CL_int /* Data and buffers */ dim := cl.CL_uint(2) var global_offset = [2]cl.CL_size_t{3, 5} var global_size = [2]cl.CL_size_t{6, 4} var local_size = [2]cl.CL_size_t{3, 2} var test [24]float32 var test_buffer cl.CL_mem /* Create a device and context */ device = utils.Create_device() context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err) if err < 0 { println("Couldn't create a context") return } /* Build the program and create a kernel */ program = utils.Build_program(context, device[:], PROGRAM_FILE, nil) kernel = cl.CLCreateKernel(*program, KERNEL_FUNC, &err) if err < 0 { println("Couldn't create a kernel") return } /* Create a write-only buffer to hold the output data */ test_buffer = cl.CLCreateBuffer(context, cl.CL_MEM_WRITE_ONLY, cl.CL_size_t(unsafe.Sizeof(test)), nil, &err) if err < 0 { println("Couldn't create a buffer") return } /* Create kernel argument */ err = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(test_buffer)), unsafe.Pointer(&test_buffer)) if err < 0 { println("Couldn't set a kernel argument") return } /* Create a command queue */ queue = cl.CLCreateCommandQueue(context, device[0], 0, &err) if err < 0 { println("Couldn't create a command queue") return } /* Enqueue kernel */ err = cl.CLEnqueueNDRangeKernel(queue, kernel, dim, global_offset[:], global_size[:], local_size[:], 0, nil, nil) if err < 0 { println("Couldn't enqueue the kernel") return } /* Read and print the result */ err = cl.CLEnqueueReadBuffer(queue, test_buffer, cl.CL_TRUE, 0, cl.CL_size_t(unsafe.Sizeof(test)), unsafe.Pointer(&test), 0, nil, nil) if err < 0 { println("Couldn't read the buffer") return } for i := 0; i < 24; i += 6 { fmt.Printf("%.2f %.2f %.2f %.2f %.2f %.2f\n", test[i], test[i+1], test[i+2], test[i+3], test[i+4], test[i+5]) } /* Deallocate resources */ cl.CLReleaseMemObject(test_buffer) cl.CLReleaseKernel(kernel) cl.CLReleaseCommandQueue(queue) cl.CLReleaseProgram(*program) cl.CLReleaseContext(context) }
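With the configuration above, the kernel covers global IDs x = 3..8 and y = 5..8, grouped into 3x2 work-groups. The sketch below walks the same index space on the host, which can help match the 24 printed values to specific work-items; enumerateWorkItems is an illustrative name and uses the standard rule that the local ID is the offset-adjusted global ID modulo the local size.

// enumerateWorkItems is a hypothetical helper that walks the 2-D index
// space configured above (offset {3,5}, global size {6,4}, local size
// {3,2}) and prints each work-item's global and local coordinates.
func enumerateWorkItems(offset, globalSize, localSize [2]cl.CL_size_t) {
	for gy := offset[1]; gy < offset[1]+globalSize[1]; gy++ {
		for gx := offset[0]; gx < offset[0]+globalSize[0]; gx++ {
			lx := (gx - offset[0]) % localSize[0]
			ly := (gy - offset[1]) % localSize[1]
			fmt.Printf("global=(%d,%d) local=(%d,%d)\n", gx, gy, lx, ly)
		}
	}
}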
func main() { /* OpenCL data structures */ var device []cl.CL_device_id var context cl.CL_context var queue cl.CL_command_queue var program *cl.CL_program var kernel cl.CL_kernel var err cl.CL_int var err1 error var global_size [2]cl.CL_size_t /* Image data */ var pixels []uint16 var png_format cl.CL_image_format var input_image, output_image cl.CL_mem var origin, region [3]cl.CL_size_t var width, height cl.CL_size_t /* Open input file and read image data */ pixels, width, height, err1 = utils.Read_image_data(INPUT_FILE) if err1 != nil { return } else { fmt.Printf("width=%d, height=%d", width, height) } /* Create a device and context */ device = utils.Create_device() context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err) if err < 0 { println("Couldn't create a context") return } /* Build the program and create a kernel */ program = utils.Build_program(context, device[:], PROGRAM_FILE, nil) kernel = cl.CLCreateKernel(*program, KERNEL_FUNC, &err) if err < 0 { fmt.Printf("Couldn't create a kernel: %d", err) return } /* Create image object */ png_format.Image_channel_order = cl.CL_LUMINANCE png_format.Image_channel_data_type = cl.CL_UNORM_INT16 input_image = cl.CLCreateImage2D(context, cl.CL_MEM_READ_ONLY|cl.CL_MEM_COPY_HOST_PTR, &png_format, width, height, 0, unsafe.Pointer(&pixels[0]), &err) output_image = cl.CLCreateImage2D(context, cl.CL_MEM_WRITE_ONLY, &png_format, width, height, 0, nil, &err) if err < 0 { println("Couldn't create the image object") return } /* Create kernel arguments */ err = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(input_image)), unsafe.Pointer(&input_image)) err |= cl.CLSetKernelArg(kernel, 1, cl.CL_size_t(unsafe.Sizeof(output_image)), unsafe.Pointer(&output_image)) if err < 0 { println("Couldn't set a kernel argument") return } /* Create a command queue */ queue = cl.CLCreateCommandQueue(context, device[0], 0, &err) if err < 0 { println("Couldn't create a command queue") return } /* Enqueue kernel */ global_size[0] = width global_size[1] = height err = cl.CLEnqueueNDRangeKernel(queue, kernel, 2, nil, global_size[:], nil, 0, nil, nil) if err < 0 { println("Couldn't enqueue the kernel") return } /* Read the image object */ origin[0] = 0 origin[1] = 0 origin[2] = 0 region[0] = width region[1] = height region[2] = 1 err = cl.CLEnqueueReadImage(queue, output_image, cl.CL_TRUE, origin, region, 0, 0, unsafe.Pointer(&pixels[0]), 0, nil, nil) if err < 0 { println("Couldn't read from the image object") return } /* Create output PNG file and write data */ utils.Write_image_data(OUTPUT_FILE, pixels, width, height) /* Deallocate resources */ cl.CLReleaseMemObject(input_image) cl.CLReleaseMemObject(output_image) cl.CLReleaseKernel(kernel) cl.CLReleaseCommandQueue(queue) cl.CLReleaseProgram(*program) cl.CLReleaseContext(context) }
func TestQueue(t *testing.T) { /* Host/device data structures */ var platform [1]cl.CL_platform_id var device [1]cl.CL_device_id var context cl.CL_context var queue cl.CL_command_queue var err cl.CL_int /* Program/kernel data structures */ var program cl.CL_program var program_buffer [1][]byte var program_log interface{} var program_size [1]cl.CL_size_t var log_size cl.CL_size_t var kernel cl.CL_kernel /* Access the first installed platform */ err = cl.CLGetPlatformIDs(1, platform[:], nil) if err < 0 { t.Errorf("Couldn't find any platforms") } /* Access the first GPU/CPU */ err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_GPU, 1, device[:], nil) if err == cl.CL_DEVICE_NOT_FOUND { err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_CPU, 1, device[:], nil) } if err < 0 { t.Errorf("Couldn't find any devices") } /* Create a context */ context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err) if err < 0 { t.Errorf("Couldn't create a context") } /* Read each program file and place content into buffer array */ program_handle, err1 := os.Open("blank.cl") if err1 != nil { t.Errorf("Couldn't find the program file") } defer program_handle.Close() fi, err2 := program_handle.Stat() if err2 != nil { t.Errorf("Couldn't find the program stat") } program_size[0] = cl.CL_size_t(fi.Size()) program_buffer[0] = make([]byte, program_size[0]) read_size, err3 := program_handle.Read(program_buffer[0]) if err3 != nil || cl.CL_size_t(read_size) != program_size[0] { t.Errorf("read file error or file size wrong") } /* Create program from file */ program = cl.CLCreateProgramWithSource(context, 1, program_buffer[:], program_size[:], &err) if err < 0 { t.Errorf("Couldn't create the program") } /* Build program */ err = cl.CLBuildProgram(program, 1, device[:], nil, nil, nil) if err < 0 { /* Find size of log and print to std output */ cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG, 0, nil, &log_size) //program_log = (char*) malloc(log_size+1); //program_log[log_size] = '\0'; cl.CLGetProgramBuildInfo(program, device[0], cl.CL_PROGRAM_BUILD_LOG, log_size, &program_log, nil) t.Errorf("%s\n", program_log) //free(program_log); } /* Create the kernel */ kernel = cl.CLCreateKernel(program, []byte("blank"), &err) if err < 0 { t.Errorf("Couldn't create the kernel") } /* Create the command queue */ queue = cl.CLCreateCommandQueue(context, device[0], 0, &err) if err < 0 { t.Errorf("Couldn't create the command queue") } /* Enqueue the kernel execution command */ err = cl.CLEnqueueTask(queue, kernel, 0, nil, nil) if err < 0 { t.Errorf("Couldn't enqueue the kernel execution command") } else { t.Logf("Successfully queued kernel.\n") } /* Deallocate resources */ cl.CLReleaseCommandQueue(queue) cl.CLReleaseKernel(kernel) cl.CLReleaseProgram(program) cl.CLReleaseContext(context) }
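blank.cl is not reproduced here. Because the test only checks that a kernel named blank can be created and enqueued as a task, an empty kernel is sufficient; the constant below is a hypothetical stand-in for the file's contents.

// blankSource is a hypothetical stand-in for blank.cl: the test only needs
// a kernel named "blank" that can be enqueued, so an empty body suffices.
const blankSource = `
__kernel void blank() {
}`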
func svmbasic(size cl.CL_size_t, context cl.CL_context, queue cl.CL_command_queue, kernel cl.CL_kernel) { // Prepare input data as follows. // Build two arrays: // - an array that consists of the Element structures // (refer to svmbasic.h for the structure definition) // - an array that consists of the float values // // Each structure of the first array has the following pointers: // - 'internal', which points to a 'value' field of another entry // of the same array. // - 'external', which points to a float value from the the // second array. // // Pointers are set randomly. The structures do not reflect any real usage // scenario, but are illustrative for a simple device-side traversal. // // Array of Element Array of floats // structures // // ||====================|| // || ............. || ||============|| // || ............. ||<-----+ || .......... || // ||====================|| | || float || // || float* internal--||------+ || float || // || float* external--||------------------>|| float || // || float value <----||------+ || .......... || // ||====================|| | || .......... || // || ............. || | || float || // || ............. || | || float || // ||====================|| | || float || // ||====================|| | || float || // || float* internal--||------+ || float || // || float* external--||------------------>|| float || // || float value || || float || // ||====================|| || float || // || ............. || || .......... || // || ............. || ||============|| // ||====================|| // // The two arrays are created independently and are used to illustrate // two new OpenCL 2.0 API functions: // - the array of Element structures is passed to the kernel as a // kernel argument with the clSetKernelArgSVMPointer function // - the array of floats is used by the kernel indirectly, and this // dependency should be also specified with the clSetKernelExecInfo // function prior to the kernel execution var err cl.CL_int // To enable host & device code to share pointer to the same address space // the arrays should be allocated as SVM memory. Use the clSVMAlloc function // to allocate SVM memory. // // Optionally, this function allows specifying alignment in bytes as its // last argument. As this basic example doesn't require any _special_ alignment, // the following code illustrates requesting default alignment via passing // zero value. inputElements := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_READ_ONLY, size*cl.CL_size_t(unsafe.Sizeof(sampleElement)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) if nil == inputElements { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } defer cl.CLSVMFree(context, inputElements) inputFloats := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_READ_ONLY, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) if nil == inputFloats { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } defer cl.CLSVMFree(context, inputFloats) // The OpenCL kernel uses the aforementioned input arrays to compute // values for the output array. 
output := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used cl.CL_MEM_WRITE_ONLY, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), // amount of memory to allocate (in bytes) 0) // alignment in bytes (0 means default) defer cl.CLSVMFree(context, output) if nil == output { println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.") return } // In the coarse-grained buffer SVM model, only one OpenCL device (or // host) can have ownership for writing to the buffer. Specifically, host // explicitly requests the ownership by mapping/unmapping the SVM buffer. // // So to fill the input SVM buffers on the host, you need to map them to have // access from the host program. // // The following two map calls are required in case of coarse-grained SVM only. err = cl.CLEnqueueSVMMap(queue, cl.CL_TRUE, // blocking map cl.CL_MAP_WRITE, inputElements, size*cl.CL_size_t(unsafe.Sizeof(sampleElement)), 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMMap") err = cl.CLEnqueueSVMMap(queue, cl.CL_TRUE, // blocking map cl.CL_MAP_WRITE, inputFloats, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMMap") // Populate data-structures with initial data. r := rand.New(rand.NewSource(99)) for i := cl.CL_size_t(0); i < size; i++ { inputElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(i)*unsafe.Sizeof(sampleElement))) inputFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(inputFloats) + uintptr(i)*unsafe.Sizeof(sampleFloat))) randElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(r.Intn(int(size)))*unsafe.Sizeof(sampleElement))) randFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(inputFloats) + uintptr(r.Intn(int(size)))*unsafe.Sizeof(sampleFloat))) inputElement.internal = &(randElement.value) inputElement.external = randFloat inputElement.value = cl.CL_float(i) *inputFloat = cl.CL_float(i + size) } // The following two unmap calls are required in case of coarse-grained SVM only err = cl.CLEnqueueSVMUnmap(queue, inputElements, 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMUnmap") err = cl.CLEnqueueSVMUnmap(queue, inputFloats, 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMUnmap") // Pass arguments to the kernel. // According to the OpenCL 2.0 specification, you need to use a special // function to pass a pointer from SVM memory to kernel. err = cl.CLSetKernelArgSVMPointer(kernel, 0, inputElements) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelArgSVMPointer") err = cl.CLSetKernelArgSVMPointer(kernel, 1, output) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelArgSVMPointer") // For buffer based SVM (both coarse- and fine-grain) if one SVM buffer // points to memory allocated in another SVM buffer, such allocations // should be passed to the kernel via clSetKernelExecInfo. err = cl.CLSetKernelExecInfo(kernel, cl.CL_KERNEL_EXEC_INFO_SVM_PTRS, cl.CL_size_t(unsafe.Sizeof(inputFloats)), unsafe.Pointer(&inputFloats)) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelExecInfo") // Run the kernel. println("Running kernel...") var globalWorkSize [1]cl.CL_size_t globalWorkSize[0] = size err = cl.CLEnqueueNDRangeKernel(queue, kernel, 1, nil, globalWorkSize[:], nil, 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueNDRangeKernel") // Map the output SVM buffer to read the results. // Mapping is required for coarse-grained SVM only. 
err = cl.CLEnqueueSVMMap(queue, cl.CL_TRUE, // blocking map cl.CL_MAP_READ, output, size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMMap") println(" DONE.") // Validate output state for correctness. println("Checking correctness of the output buffer...") for i := cl.CL_size_t(0); i < size; i++ { inputElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(i)*unsafe.Sizeof(sampleElement))) outputFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(output) + uintptr(i)*unsafe.Sizeof(sampleFloat))) expectedValue := *(inputElement.internal) + *(inputElement.external) if *outputFloat != expectedValue { println(" FAILED.") fmt.Printf("Mismatch at position %d, read %f, expected %f\n", i, *outputFloat, expectedValue) return } } println(" PASSED.") err = cl.CLEnqueueSVMUnmap(queue, output, 0, nil, nil) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMUnmap") err = cl.CLFinish(queue) utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLFinish") }
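svmbasic depends on an Element type and on sampleElement/sampleFloat values that are declared elsewhere in the sample. The sketch below is a hypothetical reconstruction from the fields used above; the actual definitions must match the struct layout the OpenCL kernel uses, because host and device traverse the same SVM memory.

// Hypothetical sketch of the declarations svmbasic relies on. The real
// sample defines these elsewhere; the field order and sizes must agree with
// the device-side Element struct, since both sides index the same SVM
// allocations with unsafe.Sizeof-based offsets.
type Element struct {
	internal *cl.CL_float // points at the value field of another Element
	external *cl.CL_float // points into the separate array of floats
	value    cl.CL_float  // payload read through other elements' internal pointers
}

var (
	sampleElement Element     // used only for unsafe.Sizeof in the offset math
	sampleFloat   cl.CL_float // likewise
)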