func cpuInitSearchKeys(commandQueue cl.CL_command_queue, svmSearchBuf unsafe.Pointer) { var nextData *searchKey var status cl.CL_int status = cl.CLEnqueueSVMMap(commandQueue, cl.CL_TRUE, //blocking call cl.CL_MAP_WRITE_INVALIDATE_REGION, svmSearchBuf, cl.CL_size_t(NUMBER_OF_SEARCH_KEY*unsafe.Sizeof(sampleKey)), 0, nil, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMMap(svmSearchBuf)") r := rand.New(rand.NewSource(999)) // initialize nodes for i := 0; i < NUMBER_OF_SEARCH_KEY; i++ { nextData = (*searchKey)(unsafe.Pointer(uintptr(svmSearchBuf) + uintptr(i)*unsafe.Sizeof(sampleKey))) // allocate a random value to node nextData.key = cl.CL_int(r.Int()) // all pointers are null nextData.oclNode = nil nextData.nativeNode = nil } status = cl.CLEnqueueSVMUnmap(commandQueue, svmSearchBuf, 0, nil, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMUnmap(svmSearchBuf)") }
func cpuMakeBinaryTree(svmTreeBuf unsafe.Pointer) *node { var root *node var nextData *node var nextNode *node var insertedFlag bool r := rand.New(rand.NewSource(99)) // initialize nodes for i := 0; i < NUMBER_OF_NODES; i++ { nextData = (*node)(unsafe.Pointer(uintptr(svmTreeBuf) + uintptr(i)*unsafe.Sizeof(sampleNode))) // allocate a random value to node nextData.value = cl.CL_int(r.Int()) // all pointers are null nextData.left = nil nextData.right = nil } // allocate first node to root root = (*node)(svmTreeBuf) // iterative tree insert for i := 1; i < NUMBER_OF_NODES; i++ { nextData = (*node)(unsafe.Pointer(uintptr(svmTreeBuf) + uintptr(i)*unsafe.Sizeof(sampleNode))) nextNode = root insertedFlag = false for false == insertedFlag { if nextData.value <= nextNode.value { // move left if nil == nextNode.left { nextNode.left = nextData insertedFlag = true } else { nextNode = nextNode.left } } else { // move right if nil == nextNode.right { nextNode.right = nextData insertedFlag = true } else { nextNode = nextNode.right } } } } return root }
func main() { // Use this to check the output of each API call var status cl.CL_int //----------------------------------------------------- // STEP 1: Discover and initialize the platforms //----------------------------------------------------- var numPlatforms cl.CL_uint var platforms []cl.CL_platform_id // Use clGetPlatformIDs() to retrieve the number of // platforms status = cl.CLGetPlatformIDs(0, nil, &numPlatforms) // Allocate enough space for each platform platforms = make([]cl.CL_platform_id, numPlatforms) // Fill in platforms with clGetPlatformIDs() status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetPlatformIDs") //----------------------------------------------------- // STEP 2: Discover and initialize the GPU devices //----------------------------------------------------- var numDevices cl.CL_uint var devices []cl.CL_device_id // Use clGetDeviceIDs() to retrieve the number of // devices present status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU, 0, nil, &numDevices) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs") // Allocate enough space for each device devices = make([]cl.CL_device_id, numDevices) // Fill in devices with clGetDeviceIDs() status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_GPU, numDevices, devices, nil) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs") //----------------------------------------------------- // STEP 3: Create a context //----------------------------------------------------- var context cl.CL_context // Create a context using clCreateContext() and // associate it with the devices context = cl.CLCreateContext(nil, numDevices, devices, nil, nil, &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateContext") defer cl.CLReleaseContext(context) //----------------------------------------------------- // STEP 4: Create a command queue //----------------------------------------------------- var commandQueue [MAX_COMMAND_QUEUE]cl.CL_command_queue // Create a command queue using clCreateCommandQueueWithProperties(), // and associate it with the device you want to execute for i := 0; i < MAX_COMMAND_QUEUE; i++ { commandQueue[i] = cl.CLCreateCommandQueueWithProperties(context, devices[0], nil, &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateCommandQueueWithProperties") defer cl.CLReleaseCommandQueue(commandQueue[i]) } //----------------------------------------------------- // STEP 5: Create device buffers //----------------------------------------------------- producerGroupSize := cl.CL_size_t(PRODUCER_GROUP_SIZE) producerGlobalSize := cl.CL_size_t(PRODUCER_GLOBAL_SIZE) consumerGroupSize := cl.CL_size_t(CONSUMER_GROUP_SIZE) consumerGlobalSize := cl.CL_size_t(CONSUMER_GLOBAL_SIZE) var samplePipePkt [2]cl.CL_float szPipe := cl.CL_uint(PIPE_SIZE) szPipePkt := cl.CL_uint(unsafe.Sizeof(samplePipePkt)) if szPipe%PRNG_CHANNELS != 0 { szPipe = (szPipe/PRNG_CHANNELS)*PRNG_CHANNELS + PRNG_CHANNELS } consumerGlobalSize = cl.CL_size_t(szPipe) pipePktPerThread := cl.CL_int(szPipe) / PRNG_CHANNELS seed := cl.CL_int(SEED) rngType := cl.CL_int(RV_GAUSSIAN) var histMin cl.CL_float var histMax cl.CL_float if rngType == cl.CL_int(RV_UNIFORM) { histMin = 0.0 histMax = 1.0 } else { histMin = -10.0 histMax = 10.0 } localDevHist := make([]cl.CL_int, MAX_HIST_BINS) cpuHist := make([]cl.CL_int, MAX_HIST_BINS) //Create and initialize memory objects rngPipe := cl.CLCreatePipe(context, cl.CL_MEM_READ_WRITE, szPipePkt, szPipe, nil, &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clCreatePipe") devHist := cl.CLCreateBuffer(context, cl.CL_MEM_READ_WRITE|cl.CL_MEM_COPY_HOST_PTR, MAX_HIST_BINS*cl.CL_size_t(unsafe.Sizeof(localDevHist[0])), unsafe.Pointer(&localDevHist[0]), &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clCreateBuffer") //----------------------------------------------------- // STEP 6: Create and compile the program //----------------------------------------------------- programSource, programeSize := utils.Load_programsource("pipe.cl") // Create a program using clCreateProgramWithSource() program := cl.CLCreateProgramWithSource(context, 1, programSource[:], programeSize[:], &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateProgramWithSource") defer cl.CLReleaseProgram(program) // Build (compile) the program for the devices with // clBuildProgram() options := "-cl-std=CL2.0" status = cl.CLBuildProgram(program, numDevices, devices, []byte(options), nil, nil) if status != cl.CL_SUCCESS { var program_log interface{} var log_size cl.CL_size_t /* Find size of log and print to std output */ cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG, 0, nil, &log_size) cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG, log_size, &program_log, nil) fmt.Printf("%s\n", program_log) return } //utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLBuildProgram") //----------------------------------------------------- // STEP 7: Create the kernel //----------------------------------------------------- // Use clCreateKernel() to create a kernel produceKernel := cl.CLCreateKernel(program, []byte("pipe_producer"), &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel") defer cl.CLReleaseKernel(produceKernel) consumeKernel := cl.CLCreateKernel(program, []byte("pipe_consumer"), &status) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel") defer cl.CLReleaseKernel(consumeKernel) //----------------------------------------------------- // STEP 8: Set the kernel arguments //----------------------------------------------------- // Associate the input and output buffers with the // kernel // using clSetKernelArg() // Set appropriate arguments to the kernel status = cl.CLSetKernelArg(produceKernel, 0, cl.CL_size_t(unsafe.Sizeof(rngPipe)), unsafe.Pointer(&rngPipe)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(rngPipe)") status = cl.CLSetKernelArg(produceKernel, 1, cl.CL_size_t(unsafe.Sizeof(pipePktPerThread)), unsafe.Pointer(&pipePktPerThread)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(pipePktPerThread)") status = cl.CLSetKernelArg(produceKernel, 2, cl.CL_size_t(unsafe.Sizeof(seed)), unsafe.Pointer(&seed)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(seed)") status = cl.CLSetKernelArg(produceKernel, 3, cl.CL_size_t(unsafe.Sizeof(rngType)), unsafe.Pointer(&rngType)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(rngType)") //----------------------------------------------------- // STEP 9: Configure the work-item structure //----------------------------------------------------- // Define an index space (global work size) of work // items for // execution. A workgroup size (local work size) is not // required, // but can be used. // Enqueue both the kernels. var globalThreads = []cl.CL_size_t{producerGlobalSize} var localThreads = []cl.CL_size_t{producerGroupSize} //----------------------------------------------------- // STEP 10: Enqueue the kernel for execution //----------------------------------------------------- // Execute the kernel by using // clEnqueueNDRangeKernel(). // 'globalWorkSize' is the 1D dimension of the // work-items var produceEvt [1]cl.CL_event status = cl.CLEnqueueNDRangeKernel(commandQueue[0], produceKernel, 1, nil, globalThreads, localThreads, 0, nil, &produceEvt[0]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueNDRangeKernel") /* launch consumer kernel only after producer has finished. This is done to avoid concurrent kernels execution as the memory consistency of pipe is guaranteed only across synchronization points. */ status = cl.CLWaitForEvents(1, produceEvt[:]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clWaitForEvents(produceEvt)") //----------------------------------------------------- // STEP 8: Set the kernel arguments //----------------------------------------------------- // Associate the input and output buffers with the // kernel // using clSetKernelArg() // Set appropriate arguments to the kernel status = cl.CLSetKernelArg(consumeKernel, 0, cl.CL_size_t(unsafe.Sizeof(rngPipe)), unsafe.Pointer(&rngPipe)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(rngPipe)") status = cl.CLSetKernelArg(consumeKernel, 1, cl.CL_size_t(unsafe.Sizeof(devHist)), unsafe.Pointer(&devHist)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(devHist)") status = cl.CLSetKernelArg(consumeKernel, 2, cl.CL_size_t(unsafe.Sizeof(histMin)), unsafe.Pointer(&histMin)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(histMin)") status = cl.CLSetKernelArg(consumeKernel, 3, cl.CL_size_t(unsafe.Sizeof(histMax)), unsafe.Pointer(&histMax)) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(histMax)") //----------------------------------------------------- // STEP 9: Configure the work-item structure //----------------------------------------------------- // Define an index space (global work size) of work // items for // execution. A workgroup size (local work size) is not // required, // but can be used. globalThreads[0] = consumerGlobalSize localThreads[0] = consumerGroupSize //----------------------------------------------------- // STEP 10: Enqueue the kernel for execution //----------------------------------------------------- // Execute the kernel by using // clEnqueueNDRangeKernel(). // 'globalWorkSize' is the 1D dimension of the // work-items var consumeEvt [1]cl.CL_event status = cl.CLEnqueueNDRangeKernel( commandQueue[1], consumeKernel, 1, nil, globalThreads, localThreads, 0, nil, &consumeEvt[0]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueNDRangeKernel") status = cl.CLFlush(commandQueue[0]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFlush(0)") status = cl.CLFlush(commandQueue[1]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFlush(1)") //wait for kernels to finish status = cl.CLFinish(commandQueue[0]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFinish(0)") status = cl.CLFinish(commandQueue[1]) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFinish(1)") //----------------------------------------------------- // STEP 11: Read the output buffer back to the host //----------------------------------------------------- // Use clEnqueueReadBuffer() to read the OpenCL output // buffer (bufferC) // to the host output array (C) //copy the data back to host buffer var readEvt cl.CL_event status = cl.CLEnqueueReadBuffer(commandQueue[1], devHist, cl.CL_TRUE, 0, (MAX_HIST_BINS)*cl.CL_size_t(unsafe.Sizeof(localDevHist[0])), unsafe.Pointer(&localDevHist[0]), 0, nil, &readEvt) utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueReadBuffer") //----------------------------------------------------- // STEP 12: Verify the results //----------------------------------------------------- //Find the tolerance limit fTol := (float32)(CONSUMER_GLOBAL_SIZE) * (float32)(COMP_TOL) / (float32)(100.0) iTol := (int)(fTol) if iTol == 0 { iTol = 1 } //CPU side histogram computation CPUReference(seed, pipePktPerThread, rngType, cpuHist, histMax, histMin) //Compare for bin := 0; bin < MAX_HIST_BINS; bin++ { diff := int(localDevHist[bin] - cpuHist[bin]) if diff < 0 { diff = -diff } if diff > iTol { println("Failed!") return } } println("Passed!") }
func CPUReference(seed cl.CL_int, pipePktPerThread cl.CL_int, rngType cl.CL_int, cpuHist []cl.CL_int, histMax, histMin cl.CL_float) { var pmPRNG PM_PRNG var irn [PRNG_CHANNELS][2]cl.CL_int var frn [PRNG_CHANNELS][2]cl.CL_float var grn [PRNG_CHANNELS][2]cl.CL_float var binWidth cl.CL_float // Initialize the prng pmPRNG.rngInit(seed) // Put starting values for each channel for ch := cl.CL_int(0); ch < cl.CL_int(PRNG_CHANNELS); ch++ { irn[ch][0] = (ch + 1) * (ch + 1) irn[ch][1] = (ch + 1) * (ch + 1) } //compute binWidth binWidth = (histMax - histMin) / cl.CL_float(MAX_HIST_BINS) for pkt := cl.CL_int(0); pkt < pipePktPerThread; pkt++ { // Generate random numbers for ch := 0; ch < PRNG_CHANNELS; ch++ { irn[ch][0] = pmPRNG.rngPM(irn[ch][1], cl.CL_int(ch)) irn[ch][1] = pmPRNG.rngPM(irn[ch][0], cl.CL_int(ch)) frn[ch][0] = cl.CL_float(irn[ch][0]) * AM if frn[ch][0] > RMAX { frn[ch][0] = cl.CL_float(RMAX) } frn[ch][1] = cl.CL_float(irn[ch][1]) * AM if frn[ch][1] > RMAX { frn[ch][1] = cl.CL_float(RMAX) } if rngType == RV_GAUSSIAN { grn[ch] = boxMuller(frn[ch]) } else { grn[ch] = frn[ch] } } // host side histogram for ch := 0; ch < PRNG_CHANNELS; ch++ { rmin := histMin rmax := rmin + binWidth found := 0 for bindex := 0; (bindex < MAX_HIST_BINS) && (found != 2); bindex++ { if (grn[ch][0] >= rmin) && (grn[ch][0] < rmax) { found += 1 cpuHist[bindex] += 1 } if (grn[ch][1] >= rmin) && (grn[ch][1] < rmax) { found += 1 cpuHist[bindex] += 1 } rmin = rmax rmax = rmin + binWidth } } } }
func main() { var i, j cl.CL_size_t // Rows and columns in the input image inputFile := "test.png" outputFile := "output.png" refFile := "ref.png" // Homegrown function to read a BMP from file inputpixels, imageWidth, imageHeight, err1 := utils.Read_image_data(inputFile) if err1 != nil { log.Fatal(err1) return } else { fmt.Printf("width=%d, height=%d (%d)\n", imageWidth, imageHeight, inputpixels[0]) } // Output image on the host outputpixels := make([]uint16, imageHeight*imageWidth) inputImage := make([]float32, imageHeight*imageWidth) outputImage := make([]float32, imageHeight*imageWidth) refImage := make([]float32, imageHeight*imageWidth) for i = 0; i < imageHeight*imageWidth; i++ { inputImage[i] = float32(inputpixels[i]) } // 45 degree motion blur var filter = [49]float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 1, 0, 0, 0, 0, -2, 0, 2, 0, 0, 0, 0, -1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} // The convolution filter is 7x7 filterWidth := cl.CL_size_t(7) filterSize := cl.CL_size_t(filterWidth * filterWidth) // Assume a square kernel // Set up the OpenCL environment var status cl.CL_int // Discovery platform var platform [1]cl.CL_platform_id status = cl.CLGetPlatformIDs(1, platform[:], nil) chk(status, "clGetPlatformIDs") // Discover device var device [1]cl.CL_device_id cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_ALL, 1, device[:], nil) chk(status, "clGetDeviceIDs") // Create context //var props =[3]cl.CL_context_properties{cl.CL_CONTEXT_PLATFORM, // (cl.CL_context_properties)(unsafe.Pointer(&platform[0])), 0}; var context cl.CL_context context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &status) chk(status, "clCreateContext") // Create command queue var queue cl.CL_command_queue queue = cl.CLCreateCommandQueue(context, device[0], 0, &status) chk(status, "clCreateCommandQueue") // The image format describes how the data will be stored in memory var format cl.CL_image_format format.Image_channel_order = cl.CL_R // single channel format.Image_channel_data_type = cl.CL_FLOAT // float data type var desc cl.CL_image_desc desc.Image_type = cl.CL_MEM_OBJECT_IMAGE2D desc.Image_width = imageWidth desc.Image_height = imageHeight desc.Image_depth = 0 desc.Image_array_size = 0 desc.Image_row_pitch = 0 desc.Image_slice_pitch = 0 desc.Num_mip_levels = 0 desc.Num_samples = 0 desc.Buffer = cl.CL_mem{} // Create space for the source image on the device d_inputImage := cl.CLCreateImage(context, cl.CL_MEM_READ_ONLY, &format, &desc, nil, &status) chk(status, "clCreateImage") // Create space for the output image on the device d_outputImage := cl.CLCreateImage(context, cl.CL_MEM_WRITE_ONLY, &format, &desc, nil, &status) chk(status, "clCreateImage") // Create space for the 7x7 filter on the device d_filter := cl.CLCreateBuffer(context, 0, filterSize*cl.CL_size_t(unsafe.Sizeof(filter[0])), nil, &status) chk(status, "clCreateBuffer") // Copy the source image to the device var origin = [3]cl.CL_size_t{0, 0, 0} // Offset within the image to copy from var region = [3]cl.CL_size_t{cl.CL_size_t(imageWidth), cl.CL_size_t(imageHeight), 1} // Elements to per dimension status = cl.CLEnqueueWriteImage(queue, d_inputImage, cl.CL_FALSE, origin, region, 0, 0, unsafe.Pointer(&inputImage[0]), 0, nil, nil) chk(status, "clEnqueueWriteImage") // Copy the 7x7 filter to the device status = cl.CLEnqueueWriteBuffer(queue, d_filter, cl.CL_FALSE, 0, filterSize*cl.CL_size_t(unsafe.Sizeof(filter[0])), unsafe.Pointer(&filter[0]), 0, nil, nil) chk(status, "clEnqueueWriteBuffer") // Create the image sampler sampler := cl.CLCreateSampler(context, cl.CL_FALSE, cl.CL_ADDRESS_CLAMP_TO_EDGE, cl.CL_FILTER_NEAREST, &status) chk(status, "clCreateSampler") // Create a program object with source and build it program := utils.Build_program(context, device[:], "convolution.cl", nil) kernel := cl.CLCreateKernel(*program, []byte("convolution"), &status) chk(status, "clCreateKernel") // Set the kernel arguments var w, h, f cl.CL_int w = cl.CL_int(imageWidth) h = cl.CL_int(imageHeight) f = cl.CL_int(filterWidth) status = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(d_inputImage)), unsafe.Pointer(&d_inputImage)) status |= cl.CLSetKernelArg(kernel, 1, cl.CL_size_t(unsafe.Sizeof(d_outputImage)), unsafe.Pointer(&d_outputImage)) status |= cl.CLSetKernelArg(kernel, 2, cl.CL_size_t(unsafe.Sizeof(h)), unsafe.Pointer(&h)) status |= cl.CLSetKernelArg(kernel, 3, cl.CL_size_t(unsafe.Sizeof(w)), unsafe.Pointer(&w)) status |= cl.CLSetKernelArg(kernel, 4, cl.CL_size_t(unsafe.Sizeof(d_filter)), unsafe.Pointer(&d_filter)) status |= cl.CLSetKernelArg(kernel, 5, cl.CL_size_t(unsafe.Sizeof(f)), unsafe.Pointer(&f)) status |= cl.CLSetKernelArg(kernel, 6, cl.CL_size_t(unsafe.Sizeof(sampler)), unsafe.Pointer(&sampler)) chk(status, "clSetKernelArg") // Set the work item dimensions var globalSize = [2]cl.CL_size_t{imageWidth, imageHeight} status = cl.CLEnqueueNDRangeKernel(queue, kernel, 2, nil, globalSize[:], nil, 0, nil, nil) chk(status, "clEnqueueNDRange") // Read the image back to the host status = cl.CLEnqueueReadImage(queue, d_outputImage, cl.CL_TRUE, origin, region, 0, 0, unsafe.Pointer(&outputImage[0]), 0, nil, nil) chk(status, "clEnqueueReadImage") // Write the output image to file for i = 0; i < imageHeight*imageWidth; i++ { outputpixels[i] = uint16(outputImage[i]) } utils.Write_image_data(outputFile, outputpixels, imageWidth, imageHeight) // Compute the reference image for i = 0; i < imageHeight; i++ { for j = 0; j < imageWidth; j++ { refImage[i*imageWidth+j] = 0 } } // Iterate over the rows of the source image halfFilterWidth := filterWidth / 2 var sum float32 for i = 0; i < imageHeight; i++ { // Iterate over the columns of the source image for j = 0; j < imageWidth; j++ { sum = 0 // Reset sum for new source pixel // Apply the filter to the neighborhood for k := -halfFilterWidth; k <= halfFilterWidth; k++ { for l := -halfFilterWidth; l <= halfFilterWidth; l++ { if i+k >= 0 && i+k < imageHeight && j+l >= 0 && j+l < imageWidth { sum += inputImage[(i+k)*imageWidth+j+l] * filter[(k+halfFilterWidth)*filterWidth+ l+halfFilterWidth] } else { i_k := i + k j_l := j + l if i+k < 0 { i_k = 0 } else if i+k >= imageHeight { i_k = imageHeight - 1 } if j+l < 0 { j_l = 0 } else if j+l >= imageWidth { j_l = imageWidth - 1 } sum += inputImage[(i_k)*imageWidth+j_l] * filter[(k+halfFilterWidth)*filterWidth+ l+halfFilterWidth] } } } refImage[i*imageWidth+j] = sum } } // Write the ref image to file for i = 0; i < imageHeight*imageWidth; i++ { outputpixels[i] = uint16(refImage[i]) } utils.Write_image_data(refFile, outputpixels, imageWidth, imageHeight) failed := 0 for i = 0; i < imageHeight; i++ { for j = 0; j < imageWidth; j++ { if math.Abs(float64(outputImage[i*imageWidth+j]-refImage[i*imageWidth+j])) > 0.01 { //fmt.Printf("Results are INCORRECT\n"); //fmt.Printf("Pixel mismatch at <%d,%d> (%f vs. %f) %f\n", i, j, // outputImage[i*imageWidth+j], refImage[i*imageWidth+j], inputImage[i*imageWidth+j]); failed++ } } } fmt.Printf("Mismatch Pixel number/Total pixel number = %d/%d\n", failed, imageWidth*imageHeight) // Free OpenCL resources cl.CLReleaseKernel(kernel) cl.CLReleaseProgram(*program) cl.CLReleaseCommandQueue(queue) cl.CLReleaseMemObject(d_inputImage) cl.CLReleaseMemObject(d_outputImage) cl.CLReleaseMemObject(d_filter) cl.CLReleaseSampler(sampler) cl.CLReleaseContext(context) }
func main() { // This code executes on the OpenCL host // Host data var size cl.CL_int var A []cl.CL_int //input array var B []cl.CL_int //input array var C []cl.CL_int //output array // Elements in each array const elements = cl.CL_size_t(2048) // Compute the size of the data datasize := cl.CL_size_t(unsafe.Sizeof(size)) * elements // Allocate space for input/output data A = make([]cl.CL_int, datasize) B = make([]cl.CL_int, datasize) C = make([]cl.CL_int, datasize) // Initialize the input data for i := cl.CL_int(0); i < cl.CL_int(elements); i++ { A[i] = i B[i] = i } // Use this to check the output of each API call var status cl.CL_int //----------------------------------------------------- // STEP 1: Discover and initialize the platforms //----------------------------------------------------- var numPlatforms cl.CL_uint var platforms []cl.CL_platform_id // Use clGetPlatformIDs() to retrieve the number of // platforms status = cl.CLGetPlatformIDs(0, nil, &numPlatforms) // Allocate enough space for each platform platforms = make([]cl.CL_platform_id, numPlatforms) // Fill in platforms with clGetPlatformIDs() status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil) if status != cl.CL_SUCCESS { println("CLGetPlatformIDs status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 2: Discover and initialize the devices //----------------------------------------------------- var numDevices cl.CL_uint var devices []cl.CL_device_id // Use clGetDeviceIDs() to retrieve the number of // devices present status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_ALL, 0, nil, &numDevices) if status != cl.CL_SUCCESS { println("CLGetDeviceIDs status!=cl.CL_SUCCESS") return } // Allocate enough space for each device devices = make([]cl.CL_device_id, numDevices) // Fill in devices with clGetDeviceIDs() status = cl.CLGetDeviceIDs(platforms[0], cl.CL_DEVICE_TYPE_ALL, numDevices, devices, nil) if status != cl.CL_SUCCESS { println("CLGetDeviceIDs status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 3: Create a context //----------------------------------------------------- var context cl.CL_context // Create a context using clCreateContext() and // associate it with the devices context = cl.CLCreateContext(nil, numDevices, devices, nil, nil, &status) if status != cl.CL_SUCCESS { println("CLCreateContext status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 4: Create a command queue //----------------------------------------------------- var cmdQueue cl.CL_command_queue // Create a command queue using clCreateCommandQueue(), // and associate it with the device you want to execute // on cmdQueue = cl.CLCreateCommandQueue(context, devices[0], 0, &status) if status != cl.CL_SUCCESS { println("CLCreateCommandQueue status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 5: Create device buffers //----------------------------------------------------- var bufferA cl.CL_mem // Input array on the device var bufferB cl.CL_mem // Input array on the device var bufferC cl.CL_mem // Output array on the device // Use clCreateBuffer() to create a buffer object (d_A) // that will contain the data from the host array A bufferA = cl.CLCreateBuffer(context, cl.CL_MEM_READ_ONLY, datasize, nil, &status) if status != cl.CL_SUCCESS { println("CLCreateBuffer status!=cl.CL_SUCCESS") return } // Use clCreateBuffer() to create a buffer object (d_B) // that will contain the data from the host array B bufferB = cl.CLCreateBuffer(context, cl.CL_MEM_READ_ONLY, datasize, nil, &status) if status != cl.CL_SUCCESS { println("CLCreateBuffer status!=cl.CL_SUCCESS") return } // Use clCreateBuffer() to create a buffer object (d_C) // with enough space to hold the output data bufferC = cl.CLCreateBuffer(context, cl.CL_MEM_WRITE_ONLY, datasize, nil, &status) if status != cl.CL_SUCCESS { println("CLCreateBuffer status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 6: Write host data to device buffers //----------------------------------------------------- // Use clEnqueueWriteBuffer() to write input array A to // the device buffer bufferA status = cl.CLEnqueueWriteBuffer(cmdQueue, bufferA, cl.CL_FALSE, 0, datasize, unsafe.Pointer(&A[0]), 0, nil, nil) if status != cl.CL_SUCCESS { println("CLEnqueueWriteBuffer status!=cl.CL_SUCCESS") return } // Use clEnqueueWriteBuffer() to write input array B to // the device buffer bufferB status = cl.CLEnqueueWriteBuffer(cmdQueue, bufferB, cl.CL_FALSE, 0, datasize, unsafe.Pointer(&B[0]), 0, nil, nil) if status != cl.CL_SUCCESS { println("CLEnqueueWriteBuffer status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 7: Create and compile the program //----------------------------------------------------- programSource, programeSize := utils.Load_programsource("chapter2.cl") // Create a program using clCreateProgramWithSource() program := cl.CLCreateProgramWithSource(context, 1, programSource[:], programeSize[:], &status) if status != cl.CL_SUCCESS { println("CLCreateProgramWithSource status!=cl.CL_SUCCESS") return } // Build (compile) the program for the devices with // clBuildProgram() status = cl.CLBuildProgram(program, numDevices, devices, nil, nil, nil) if status != cl.CL_SUCCESS { println("CLBuildProgram status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 8: Create the kernel //----------------------------------------------------- var kernel cl.CL_kernel // Use clCreateKernel() to create a kernel from the // vector addition function (named "vecadd") kernel = cl.CLCreateKernel(program, []byte("vecadd"), &status) if status != cl.CL_SUCCESS { println("CLCreateKernel status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 9: Set the kernel arguments //----------------------------------------------------- // Associate the input and output buffers with the // kernel // using clSetKernelArg() status = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(bufferA)), unsafe.Pointer(&bufferA)) status |= cl.CLSetKernelArg(kernel, 1, cl.CL_size_t(unsafe.Sizeof(bufferB)), unsafe.Pointer(&bufferB)) status |= cl.CLSetKernelArg(kernel, 2, cl.CL_size_t(unsafe.Sizeof(bufferC)), unsafe.Pointer(&bufferC)) if status != cl.CL_SUCCESS { println("CLSetKernelArg status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 10: Configure the work-item structure //----------------------------------------------------- // Define an index space (global work size) of work // items for // execution. A workgroup size (local work size) is not // required, // but can be used. var globalWorkSize [1]cl.CL_size_t // There are 'elements' work-items globalWorkSize[0] = elements //----------------------------------------------------- // STEP 11: Enqueue the kernel for execution //----------------------------------------------------- // Execute the kernel by using // clEnqueueNDRangeKernel(). // 'globalWorkSize' is the 1D dimension of the // work-items status = cl.CLEnqueueNDRangeKernel(cmdQueue, kernel, 1, nil, globalWorkSize[:], nil, 0, nil, nil) if status != cl.CL_SUCCESS { println("CLEnqueueNDRangeKernel status!=cl.CL_SUCCESS") return } //----------------------------------------------------- // STEP 12: Read the output buffer back to the host //----------------------------------------------------- // Use clEnqueueReadBuffer() to read the OpenCL output // buffer (bufferC) // to the host output array (C) cl.CLEnqueueReadBuffer(cmdQueue, bufferC, cl.CL_TRUE, 0, datasize, unsafe.Pointer(&C[0]), 0, nil, nil) if status != cl.CL_SUCCESS { println("CLEnqueueReadBuffer status!=cl.CL_SUCCESS") return } // Verify the output result := true for i := cl.CL_int(0); i < cl.CL_int(elements); i++ { if C[i] != i+i { result = false break } } if result { println("Output is correct\n") } else { println("Output is incorrect\n") } //----------------------------------------------------- // STEP 13: Release OpenCL resources //----------------------------------------------------- // Free OpenCL resources cl.CLReleaseKernel(kernel) cl.CLReleaseProgram(program) cl.CLReleaseCommandQueue(cmdQueue) cl.CLReleaseMemObject(bufferA) cl.CLReleaseMemObject(bufferB) cl.CLReleaseMemObject(bufferC) cl.CLReleaseContext(context) }
func main() { /* OpenCL data structures */ var device []cl.CL_device_id var context cl.CL_context var queue cl.CL_command_queue var program *cl.CL_program var kernel cl.CL_kernel var err cl.CL_int /* Data and events */ var num_ints cl.CL_int var num_items [1]cl.CL_size_t var data [NUM_INTS]cl.CL_int var data_buffer cl.CL_mem var prof_event cl.CL_event var total_time cl.CL_ulong var time_start, time_end interface{} /* Initialize data */ for i := 0; i < NUM_INTS; i++ { data[i] = cl.CL_int(i) } /* Set number of data points and work-items */ num_ints = NUM_INTS num_items[0] = NUM_ITEMS /* Create a device and context */ device = utils.Create_device() context = cl.CLCreateContext(nil, 1, device[:], nil, nil, &err) if err < 0 { println("Couldn't create a context") return } /* Build the program and create a kernel */ program = utils.Build_program(context, device[:], PROGRAM_FILE, nil) kernel = cl.CLCreateKernel(*program, KERNEL_FUNC, &err) if err < 0 { println("Couldn't create a kernel") return } /* Create a buffer to hold data */ data_buffer = cl.CLCreateBuffer(context, cl.CL_MEM_READ_WRITE|cl.CL_MEM_COPY_HOST_PTR, cl.CL_size_t(unsafe.Sizeof(data[0]))*NUM_INTS, unsafe.Pointer(&data[0]), &err) if err < 0 { println("Couldn't create a buffer") return } /* Create kernel argument */ err = cl.CLSetKernelArg(kernel, 0, cl.CL_size_t(unsafe.Sizeof(data_buffer)), unsafe.Pointer(&data_buffer)) if err < 0 { println("Couldn't set a kernel argument") return } cl.CLSetKernelArg(kernel, 1, cl.CL_size_t(unsafe.Sizeof(num_ints)), unsafe.Pointer(&num_ints)) /* Create a command queue */ queue = cl.CLCreateCommandQueue(context, device[0], cl.CL_QUEUE_PROFILING_ENABLE, &err) if err < 0 { println("Couldn't create a command queue") return } total_time = 0.0 for i := 0; i < NUM_ITERATIONS; i++ { /* Enqueue kernel */ cl.CLEnqueueNDRangeKernel(queue, kernel, 1, nil, num_items[:], nil, 0, nil, &prof_event) if err < 0 { println("Couldn't enqueue the kernel") return } /* Finish processing the queue and get profiling information */ cl.CLFinish(queue) cl.CLGetEventProfilingInfo(prof_event, cl.CL_PROFILING_COMMAND_START, cl.CL_size_t(unsafe.Sizeof(total_time)), &time_start, nil) cl.CLGetEventProfilingInfo(prof_event, cl.CL_PROFILING_COMMAND_END, cl.CL_size_t(unsafe.Sizeof(total_time)), &time_end, nil) total_time += time_end.(cl.CL_ulong) - time_start.(cl.CL_ulong) } fmt.Printf("Average time = %v\n", total_time/NUM_ITERATIONS) /* Deallocate resources */ cl.CLReleaseEvent(prof_event) cl.CLReleaseKernel(kernel) cl.CLReleaseMemObject(data_buffer) cl.CLReleaseCommandQueue(queue) cl.CLReleaseProgram(*program) cl.CLReleaseContext(context) }
func TestPlatform(t *testing.T) { /* Host data structures */ var platforms []cl.CL_platform_id var num_platforms cl.CL_uint var err, i, platform_index cl.CL_int platform_index = -1 /* Extension data */ var ext_data interface{} var ext_size cl.CL_size_t const icd_ext string = "cl_khr_icd" err = cl.CLGetPlatformIDs(1, platforms, &num_platforms) if err != cl.CL_SUCCESS { t.Errorf("Couldn't find any platforms.") } err = cl.CLGetPlatformIDs(0, platforms, &num_platforms) if err != cl.CL_SUCCESS { t.Errorf("Couldn't find any platforms.") } err = cl.CLGetPlatformIDs(1, platforms, nil) if err != cl.CL_INVALID_VALUE { t.Errorf("Couldn't find any platforms.") } /* Find number of platforms */ err = cl.CLGetPlatformIDs(1, nil, &num_platforms) if err != cl.CL_SUCCESS { t.Errorf("Couldn't find any platforms.") } /* Access all installed platforms */ platforms = make([]cl.CL_platform_id, num_platforms) err = cl.CLGetPlatformIDs(0, platforms, nil) if err == cl.CL_SUCCESS { t.Errorf("Couldn't get any platforms.") } err = cl.CLGetPlatformIDs(num_platforms, platforms, nil) if err != cl.CL_SUCCESS { t.Errorf("Couldn't get any platforms.") } /* Find extensions of all platforms */ for i = 0; i < cl.CL_int(num_platforms); i++ { err = cl.CLGetPlatformInfo(platforms[i], cl.CL_PLATFORM_EXTENSIONS, 100, nil, &ext_size) if err != cl.CL_SUCCESS { t.Errorf("Couldn't read extension data.") } /* Find size of extension data */ err = cl.CLGetPlatformInfo(platforms[i], cl.CL_PLATFORM_EXTENSIONS, 0, nil, &ext_size) if err != cl.CL_SUCCESS { t.Errorf("Couldn't read extension data.") } err = cl.CLGetPlatformInfo(platforms[i], cl.CL_PLATFORM_EXTENSIONS, 0, &ext_data, nil) if err == cl.CL_SUCCESS { t.Errorf("Platform %d supports extensions", i) } /* Access extension data */ err = cl.CLGetPlatformInfo(platforms[i], cl.CL_PLATFORM_EXTENSIONS, ext_size, &ext_data, nil) if err == cl.CL_SUCCESS { t.Logf("Platform %d supports extensions: %s\n", i, ext_data) } /* Look for ICD extension */ if strings.Contains(ext_data.(string), icd_ext) { platform_index = i break } } /* Display whether ICD extension is supported */ if platform_index > -1 { t.Logf("Platform %d supports the %s extension.\n", platform_index, icd_ext) } else { t.Logf("No platforms support the %s extension.\n", icd_ext) } }
func main() { /* Host/device data structures */ var platform [1]cl.CL_platform_id var devices []cl.CL_device_id var num_devices cl.CL_uint var i, err cl.CL_int /* Extension data */ var paramValueSize cl.CL_size_t var name_data interface{} var ext_data interface{} var addr_data interface{} /* Identify a platform */ err = cl.CLGetPlatformIDs(1, platform[:], nil) if err != cl.CL_SUCCESS { println("Couldn't find any platforms") return } /* Determine number of connected devices */ err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_ALL, 0, nil, &num_devices) if err != cl.CL_SUCCESS { println("Couldn't find any devices") return } /* Access connected devices */ devices = make([]cl.CL_device_id, num_devices) err = cl.CLGetDeviceIDs(platform[0], cl.CL_DEVICE_TYPE_ALL, num_devices, devices, nil) if err != cl.CL_SUCCESS { println("Couldn't get any devices.") return } /* Obtain data for each connected device */ for i = 0; i < cl.CL_int(num_devices); i++ { err = cl.CLGetDeviceInfo(devices[i], cl.CL_DEVICE_NAME, 0, nil, ¶mValueSize) if err != cl.CL_SUCCESS { fmt.Printf("Failed to find OpenCL device info %s.\n", "NAME") return } err = cl.CLGetDeviceInfo(devices[i], cl.CL_DEVICE_NAME, paramValueSize, &name_data, nil) if err != cl.CL_SUCCESS { fmt.Printf("Failed to find OpenCL device info %s.\n", "NAME") return } err = cl.CLGetDeviceInfo(devices[i], cl.CL_DEVICE_ADDRESS_BITS, 0, nil, ¶mValueSize) if err != cl.CL_SUCCESS { fmt.Printf("Failed to find OpenCL device info %s.\n", "NAME") return } err = cl.CLGetDeviceInfo(devices[i], cl.CL_DEVICE_ADDRESS_BITS, paramValueSize, &addr_data, nil) if err != cl.CL_SUCCESS { fmt.Printf("Failed to find OpenCL device info %s.\n", "NAME") return } err = cl.CLGetDeviceInfo(devices[i], cl.CL_DEVICE_EXTENSIONS, 0, nil, ¶mValueSize) if err != cl.CL_SUCCESS { fmt.Printf("Failed to find OpenCL device info %s.\n", "NAME") return } err = cl.CLGetDeviceInfo(devices[i], cl.CL_DEVICE_EXTENSIONS, paramValueSize, &ext_data, nil) if err != cl.CL_SUCCESS { fmt.Printf("Failed to find OpenCL device info %s.\n", "NAME") return } fmt.Printf("NAME: %s\nADDRESS_WIDTH: %d\nEXTENSIONS: %s\n\n", name_data.(string), addr_data.(cl.CL_uint), ext_data.(string)) } }