Ejemplo n.º 1
0
Archivo: bst.go Proyecto: xfong/gocl
func svmCompareResults(commandQueue cl.CL_command_queue,
	svmSearchBuf unsafe.Pointer) bool {
	var compare_status bool
	var status cl.CL_int

	status = cl.CLEnqueueSVMMap(commandQueue,
		cl.CL_TRUE, //blocking call
		cl.CL_MAP_WRITE_INVALIDATE_REGION,
		svmSearchBuf,
		cl.CL_size_t(NUMBER_OF_SEARCH_KEY*unsafe.Sizeof(sampleKey)),
		0,
		nil,
		nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMMap(svmSearchBuf)")

	compare_status = true
	for i := 0; i < NUMBER_OF_SEARCH_KEY; i++ {
		currKey := (*searchKey)(unsafe.Pointer(uintptr(svmSearchBuf) + uintptr(i)*unsafe.Sizeof(sampleKey)))

		/* compare OCL and native nodes */
		if currKey.oclNode != currKey.nativeNode {
			compare_status = false
			break
		}
	}

	status = cl.CLEnqueueSVMUnmap(commandQueue,
		svmSearchBuf,
		0,
		nil,
		nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMUnmap(svmSearchBuf)")

	return compare_status
}
Ejemplo n.º 2
0
Archivo: bst.go Proyecto: xfong/gocl
func cpuInitSearchKeys(commandQueue cl.CL_command_queue,
	svmSearchBuf unsafe.Pointer) {
	var nextData *searchKey
	var status cl.CL_int

	status = cl.CLEnqueueSVMMap(commandQueue,
		cl.CL_TRUE, //blocking call
		cl.CL_MAP_WRITE_INVALIDATE_REGION,
		svmSearchBuf,
		cl.CL_size_t(NUMBER_OF_SEARCH_KEY*unsafe.Sizeof(sampleKey)),
		0,
		nil,
		nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMMap(svmSearchBuf)")

	r := rand.New(rand.NewSource(999))

	// initialize nodes
	for i := 0; i < NUMBER_OF_SEARCH_KEY; i++ {
		nextData = (*searchKey)(unsafe.Pointer(uintptr(svmSearchBuf) + uintptr(i)*unsafe.Sizeof(sampleKey)))
		// allocate a random value to node
		nextData.key = cl.CL_int(r.Int())
		// all pointers are null
		nextData.oclNode = nil
		nextData.nativeNode = nil
	}

	status = cl.CLEnqueueSVMUnmap(commandQueue,
		svmSearchBuf,
		0,
		nil,
		nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMUnmap(svmSearchBuf)")
}
Ejemplo n.º 3
0
Archivo: bst.go Proyecto: xfong/gocl
/**
 * cpuCreateBinaryTree()
 * creates a tree from the data in "svmTreeBuf". If this is NULL returns NULL
 * else returns root of the tree.
 **/
func cpuCreateBinaryTree(commandQueue cl.CL_command_queue,
	svmTreeBuf unsafe.Pointer) *node {
	var root *node
	var status cl.CL_int

	// reserve svm space for CPU update
	status = cl.CLEnqueueSVMMap(commandQueue,
		cl.CL_TRUE, //blocking call
		cl.CL_MAP_WRITE_INVALIDATE_REGION,
		svmTreeBuf,
		cl.CL_size_t(NUMBER_OF_NODES*unsafe.Sizeof(sampleNode)),
		0,
		nil,
		nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMMap(svmTreeBuf)")

	//init node and make bt
	root = cpuMakeBinaryTree(svmTreeBuf)

	status = cl.CLEnqueueSVMUnmap(commandQueue,
		svmTreeBuf,
		0,
		nil,
		nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMUnmap(svmTreeBuf)")

	return root
}
Ejemplo n.º 4
0
Archivo: pipe.go Proyecto: xfong/gocl
func main() {
	// Use this to check the output of each API call
	var status cl.CL_int

	//-----------------------------------------------------
	// STEP 1: Discover and initialize the platforms
	//-----------------------------------------------------
	var numPlatforms cl.CL_uint
	var platforms []cl.CL_platform_id

	// Use clGetPlatformIDs() to retrieve the number of
	// platforms
	status = cl.CLGetPlatformIDs(0, nil, &numPlatforms)

	// Allocate enough space for each platform
	platforms = make([]cl.CL_platform_id, numPlatforms)

	// Fill in platforms with clGetPlatformIDs()
	status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetPlatformIDs")

	//-----------------------------------------------------
	// STEP 2: Discover and initialize the GPU devices
	//-----------------------------------------------------
	var numDevices cl.CL_uint
	var devices []cl.CL_device_id

	// Use clGetDeviceIDs() to retrieve the number of
	// devices present
	status = cl.CLGetDeviceIDs(platforms[0],
		cl.CL_DEVICE_TYPE_GPU,
		0,
		nil,
		&numDevices)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs")

	// Allocate enough space for each device
	devices = make([]cl.CL_device_id, numDevices)

	// Fill in devices with clGetDeviceIDs()
	status = cl.CLGetDeviceIDs(platforms[0],
		cl.CL_DEVICE_TYPE_GPU,
		numDevices,
		devices,
		nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs")

	//-----------------------------------------------------
	// STEP 3: Create a context
	//-----------------------------------------------------
	var context cl.CL_context

	// Create a context using clCreateContext() and
	// associate it with the devices
	context = cl.CLCreateContext(nil,
		numDevices,
		devices,
		nil,
		nil,
		&status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateContext")
	defer cl.CLReleaseContext(context)

	//-----------------------------------------------------
	// STEP 4: Create a command queue
	//-----------------------------------------------------
	var commandQueue [MAX_COMMAND_QUEUE]cl.CL_command_queue

	// Create a command queue using clCreateCommandQueueWithProperties(),
	// and associate it with the device you want to execute
	for i := 0; i < MAX_COMMAND_QUEUE; i++ {
		commandQueue[i] = cl.CLCreateCommandQueueWithProperties(context,
			devices[0],
			nil,
			&status)
		utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateCommandQueueWithProperties")
		defer cl.CLReleaseCommandQueue(commandQueue[i])
	}

	//-----------------------------------------------------
	// STEP 5: Create device buffers
	//-----------------------------------------------------
	producerGroupSize := cl.CL_size_t(PRODUCER_GROUP_SIZE)
	producerGlobalSize := cl.CL_size_t(PRODUCER_GLOBAL_SIZE)

	consumerGroupSize := cl.CL_size_t(CONSUMER_GROUP_SIZE)
	consumerGlobalSize := cl.CL_size_t(CONSUMER_GLOBAL_SIZE)

	var samplePipePkt [2]cl.CL_float
	szPipe := cl.CL_uint(PIPE_SIZE)
	szPipePkt := cl.CL_uint(unsafe.Sizeof(samplePipePkt))
	if szPipe%PRNG_CHANNELS != 0 {
		szPipe = (szPipe/PRNG_CHANNELS)*PRNG_CHANNELS + PRNG_CHANNELS
	}
	consumerGlobalSize = cl.CL_size_t(szPipe)
	pipePktPerThread := cl.CL_int(szPipe) / PRNG_CHANNELS
	seed := cl.CL_int(SEED)
	rngType := cl.CL_int(RV_GAUSSIAN)
	var histMin cl.CL_float
	var histMax cl.CL_float
	if rngType == cl.CL_int(RV_UNIFORM) {
		histMin = 0.0
		histMax = 1.0
	} else {
		histMin = -10.0
		histMax = 10.0
	}

	localDevHist := make([]cl.CL_int, MAX_HIST_BINS)
	cpuHist := make([]cl.CL_int, MAX_HIST_BINS)

	//Create and initialize memory objects
	rngPipe := cl.CLCreatePipe(context,
		cl.CL_MEM_READ_WRITE,
		szPipePkt,
		szPipe,
		nil,
		&status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clCreatePipe")

	devHist := cl.CLCreateBuffer(context,
		cl.CL_MEM_READ_WRITE|cl.CL_MEM_COPY_HOST_PTR,
		MAX_HIST_BINS*cl.CL_size_t(unsafe.Sizeof(localDevHist[0])),
		unsafe.Pointer(&localDevHist[0]),
		&status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clCreateBuffer")

	//-----------------------------------------------------
	// STEP 6: Create and compile the program
	//-----------------------------------------------------
	programSource, programeSize := utils.Load_programsource("pipe.cl")

	// Create a program using clCreateProgramWithSource()
	program := cl.CLCreateProgramWithSource(context,
		1,
		programSource[:],
		programeSize[:],
		&status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateProgramWithSource")
	defer cl.CLReleaseProgram(program)

	// Build (compile) the program for the devices with
	// clBuildProgram()
	options := "-cl-std=CL2.0"
	status = cl.CLBuildProgram(program,
		numDevices,
		devices,
		[]byte(options),
		nil,
		nil)
	if status != cl.CL_SUCCESS {
		var program_log interface{}
		var log_size cl.CL_size_t

		/* Find size of log and print to std output */
		cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG,
			0, nil, &log_size)
		cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG,
			log_size, &program_log, nil)
		fmt.Printf("%s\n", program_log)
		return
	}
	//utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLBuildProgram")

	//-----------------------------------------------------
	// STEP 7: Create the kernel
	//-----------------------------------------------------
	// Use clCreateKernel() to create a kernel
	produceKernel := cl.CLCreateKernel(program, []byte("pipe_producer"), &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel")
	defer cl.CLReleaseKernel(produceKernel)

	consumeKernel := cl.CLCreateKernel(program, []byte("pipe_consumer"), &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel")
	defer cl.CLReleaseKernel(consumeKernel)

	//-----------------------------------------------------
	// STEP 8: Set the kernel arguments
	//-----------------------------------------------------
	// Associate the input and output buffers with the
	// kernel
	// using clSetKernelArg()
	// Set appropriate arguments to the kernel
	status = cl.CLSetKernelArg(produceKernel,
		0,
		cl.CL_size_t(unsafe.Sizeof(rngPipe)),
		unsafe.Pointer(&rngPipe))

	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(rngPipe)")

	status = cl.CLSetKernelArg(produceKernel,
		1,
		cl.CL_size_t(unsafe.Sizeof(pipePktPerThread)),
		unsafe.Pointer(&pipePktPerThread))
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(pipePktPerThread)")

	status = cl.CLSetKernelArg(produceKernel,
		2,
		cl.CL_size_t(unsafe.Sizeof(seed)),
		unsafe.Pointer(&seed))
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(seed)")

	status = cl.CLSetKernelArg(produceKernel,
		3,
		cl.CL_size_t(unsafe.Sizeof(rngType)),
		unsafe.Pointer(&rngType))
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(rngType)")

	//-----------------------------------------------------
	// STEP 9: Configure the work-item structure
	//-----------------------------------------------------
	// Define an index space (global work size) of work
	// items for
	// execution. A workgroup size (local work size) is not
	// required,
	// but can be used.
	// Enqueue both the kernels.
	var globalThreads = []cl.CL_size_t{producerGlobalSize}
	var localThreads = []cl.CL_size_t{producerGroupSize}

	//-----------------------------------------------------
	// STEP 10: Enqueue the kernel for execution
	//-----------------------------------------------------
	// Execute the kernel by using
	// clEnqueueNDRangeKernel().
	// 'globalWorkSize' is the 1D dimension of the
	// work-items
	var produceEvt [1]cl.CL_event
	status = cl.CLEnqueueNDRangeKernel(commandQueue[0],
		produceKernel,
		1,
		nil,
		globalThreads,
		localThreads,
		0,
		nil,
		&produceEvt[0])
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueNDRangeKernel")

	/*
	   launch consumer kernel only after producer has finished.
	   This is done to avoid concurrent kernels execution as the
	   memory consistency of pipe is guaranteed only across
	   synchronization points.
	*/
	status = cl.CLWaitForEvents(1, produceEvt[:])
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clWaitForEvents(produceEvt)")

	//-----------------------------------------------------
	// STEP 8: Set the kernel arguments
	//-----------------------------------------------------
	// Associate the input and output buffers with the
	// kernel
	// using clSetKernelArg()
	// Set appropriate arguments to the kernel
	status = cl.CLSetKernelArg(consumeKernel,
		0,
		cl.CL_size_t(unsafe.Sizeof(rngPipe)),
		unsafe.Pointer(&rngPipe))
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(rngPipe)")

	status = cl.CLSetKernelArg(consumeKernel,
		1,
		cl.CL_size_t(unsafe.Sizeof(devHist)),
		unsafe.Pointer(&devHist))
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(devHist)")

	status = cl.CLSetKernelArg(consumeKernel,
		2,
		cl.CL_size_t(unsafe.Sizeof(histMin)),
		unsafe.Pointer(&histMin))
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(histMin)")

	status = cl.CLSetKernelArg(consumeKernel,
		3,
		cl.CL_size_t(unsafe.Sizeof(histMax)),
		unsafe.Pointer(&histMax))
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArg(histMax)")

	//-----------------------------------------------------
	// STEP 9: Configure the work-item structure
	//-----------------------------------------------------
	// Define an index space (global work size) of work
	// items for
	// execution. A workgroup size (local work size) is not
	// required,
	// but can be used.
	globalThreads[0] = consumerGlobalSize
	localThreads[0] = consumerGroupSize

	//-----------------------------------------------------
	// STEP 10: Enqueue the kernel for execution
	//-----------------------------------------------------
	// Execute the kernel by using
	// clEnqueueNDRangeKernel().
	// 'globalWorkSize' is the 1D dimension of the
	// work-items
	var consumeEvt [1]cl.CL_event
	status = cl.CLEnqueueNDRangeKernel(
		commandQueue[1],
		consumeKernel,
		1,
		nil,
		globalThreads,
		localThreads,
		0,
		nil,
		&consumeEvt[0])
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueNDRangeKernel")

	status = cl.CLFlush(commandQueue[0])
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFlush(0)")

	status = cl.CLFlush(commandQueue[1])
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFlush(1)")

	//wait for kernels to finish
	status = cl.CLFinish(commandQueue[0])
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFinish(0)")

	status = cl.CLFinish(commandQueue[1])
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFinish(1)")

	//-----------------------------------------------------
	// STEP 11: Read the output buffer back to the host
	//-----------------------------------------------------
	// Use clEnqueueReadBuffer() to read the OpenCL output
	// buffer (bufferC)
	// to the host output array (C)
	//copy the data back to host buffer
	var readEvt cl.CL_event
	status = cl.CLEnqueueReadBuffer(commandQueue[1],
		devHist,
		cl.CL_TRUE,
		0,
		(MAX_HIST_BINS)*cl.CL_size_t(unsafe.Sizeof(localDevHist[0])),
		unsafe.Pointer(&localDevHist[0]),
		0,
		nil,
		&readEvt)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueReadBuffer")

	//-----------------------------------------------------
	// STEP 12: Verify the results
	//-----------------------------------------------------
	//Find the tolerance limit
	fTol := (float32)(CONSUMER_GLOBAL_SIZE) * (float32)(COMP_TOL) / (float32)(100.0)
	iTol := (int)(fTol)
	if iTol == 0 {
		iTol = 1
	}

	//CPU side histogram computation
	CPUReference(seed, pipePktPerThread, rngType, cpuHist, histMax, histMin)

	//Compare
	for bin := 0; bin < MAX_HIST_BINS; bin++ {
		diff := int(localDevHist[bin] - cpuHist[bin])

		if diff < 0 {
			diff = -diff
		}
		if diff > iTol {
			println("Failed!")
			return
		}
	}

	println("Passed!")
}
Ejemplo n.º 5
0
Archivo: bst.go Proyecto: xfong/gocl
func svmBinaryTreeCPUReference(commandQueue cl.CL_command_queue,
	svmRoot *node,
	svmTreeBuf unsafe.Pointer,
	svmSearchBuf unsafe.Pointer) {
	var status cl.CL_int

	/* reserve svm buffers for cpu usage */
	status = cl.CLEnqueueSVMMap(commandQueue,
		cl.CL_TRUE, //blocking call
		cl.CL_MAP_READ,
		svmTreeBuf,
		cl.CL_size_t(NUMBER_OF_NODES*unsafe.Sizeof(sampleNode)),
		0,
		nil,
		nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMMap(svmTreeBuf)")

	status = cl.CLEnqueueSVMMap(commandQueue,
		cl.CL_TRUE, //blocking call
		cl.CL_MAP_WRITE,
		svmSearchBuf,
		cl.CL_size_t(NUMBER_OF_SEARCH_KEY*unsafe.Sizeof(sampleKey)),
		0,
		nil,
		nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMMap(svmSearchBuf)")

	for i := 0; i < NUMBER_OF_SEARCH_KEY; i++ {
		/* search tree */
		searchNode := svmRoot
		currKey := (*searchKey)(unsafe.Pointer(uintptr(svmSearchBuf) + uintptr(i)*unsafe.Sizeof(sampleKey)))

		for nil != searchNode {
			if currKey.key == searchNode.value {
				/* rejoice on finding key */
				currKey.nativeNode = searchNode
				searchNode = nil
			} else if currKey.key < searchNode.value {
				/* move left */
				searchNode = searchNode.left
			} else {
				/* move right */
				searchNode = searchNode.right
			}
		}
	}

	status = cl.CLEnqueueSVMUnmap(commandQueue,
		svmSearchBuf,
		0,
		nil,
		nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMUnmap(svmSearchBuf)")

	status = cl.CLEnqueueSVMUnmap(commandQueue,
		svmTreeBuf,
		0,
		nil,
		nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clEnqueueSVMUnmap(svmTreeBuf)")
}
Ejemplo n.º 6
0
Archivo: bst.go Proyecto: xfong/gocl
func main() {
	// Use this to check the output of each API call
	var status cl.CL_int

	//-----------------------------------------------------
	// STEP 1: Discover and initialize the platforms
	//-----------------------------------------------------
	var numPlatforms cl.CL_uint
	var platforms []cl.CL_platform_id

	// Use clGetPlatformIDs() to retrieve the number of
	// platforms
	status = cl.CLGetPlatformIDs(0, nil, &numPlatforms)

	// Allocate enough space for each platform
	platforms = make([]cl.CL_platform_id, numPlatforms)

	// Fill in platforms with clGetPlatformIDs()
	status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetPlatformIDs")

	//-----------------------------------------------------
	// STEP 2: Discover and initialize the GPU devices
	//-----------------------------------------------------
	var numDevices cl.CL_uint
	var devices []cl.CL_device_id

	// Use clGetDeviceIDs() to retrieve the number of
	// devices present
	status = cl.CLGetDeviceIDs(platforms[0],
		cl.CL_DEVICE_TYPE_GPU,
		0,
		nil,
		&numDevices)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs")

	// Allocate enough space for each device
	devices = make([]cl.CL_device_id, numDevices)

	// Fill in devices with clGetDeviceIDs()
	status = cl.CLGetDeviceIDs(platforms[0],
		cl.CL_DEVICE_TYPE_GPU,
		numDevices,
		devices,
		nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs")

	//-----------------------------------------------------
	// STEP 3: Create a context
	//-----------------------------------------------------
	var context cl.CL_context

	// Create a context using clCreateContext() and
	// associate it with the devices
	context = cl.CLCreateContext(nil,
		numDevices,
		devices,
		nil,
		nil,
		&status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateContext")
	defer cl.CLReleaseContext(context)

	//-----------------------------------------------------
	// STEP 4: Create a command queue
	//-----------------------------------------------------
	var cmdQueue cl.CL_command_queue

	// Create a command queue using clCreateCommandQueueWithProperties(),
	// and associate it with the device you want to execute
	cmdQueue = cl.CLCreateCommandQueueWithProperties(context,
		devices[0],
		nil,
		&status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateCommandQueueWithProperties")
	defer cl.CLReleaseCommandQueue(cmdQueue)

	//-----------------------------------------------------
	// STEP 5: Create device buffers
	//-----------------------------------------------------
	// initialize any device/SVM memory here.

	/* svm buffer for binary tree */
	svmTreeBuf := cl.CLSVMAlloc(context,
		cl.CL_MEM_READ_WRITE,
		cl.CL_size_t(NUMBER_OF_NODES*unsafe.Sizeof(sampleNode)),
		0)
	if nil == svmTreeBuf {
		println("clSVMAlloc(svmTreeBuf) failed.")
		return
	}
	defer cl.CLSVMFree(context, svmTreeBuf)

	/* svm buffer for search keys */
	svmSearchBuf := cl.CLSVMAlloc(context,
		cl.CL_MEM_READ_WRITE,
		cl.CL_size_t(NUMBER_OF_SEARCH_KEY*unsafe.Sizeof(sampleKey)),
		0)
	if nil == svmSearchBuf {
		println("clSVMAlloc(svmSearchBuf) failed.")
		return
	}
	defer cl.CLSVMFree(context, svmSearchBuf)

	//create the binary tree and set the root
	/* root node of the binary tree */
	svmRoot := cpuCreateBinaryTree(cmdQueue, svmTreeBuf)

	//initialize search keys
	cpuInitSearchKeys(cmdQueue, svmSearchBuf)

	/* if voice is not deliberately muzzled, shout parameters */
	fmt.Printf("-------------------------------------------------------------------------\n")
	fmt.Printf("Searching %d keys in a BST having %d Nodes...\n", NUMBER_OF_SEARCH_KEY, NUMBER_OF_NODES)
	fmt.Printf("-------------------------------------------------------------------------\n")

	//-----------------------------------------------------
	// STEP 6: Create and compile the program
	//-----------------------------------------------------
	programSource, programeSize := utils.Load_programsource("bst.cl")

	// Create a program using clCreateProgramWithSource()
	program := cl.CLCreateProgramWithSource(context,
		1,
		programSource[:],
		programeSize[:],
		&status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateProgramWithSource")
	defer cl.CLReleaseProgram(program)

	// Build (compile) the program for the devices with
	// clBuildProgram()
	options := "-cl-std=CL2.0"
	status = cl.CLBuildProgram(program,
		numDevices,
		devices,
		[]byte(options),
		nil,
		nil)
	if status != cl.CL_SUCCESS {
		var program_log interface{}
		var log_size cl.CL_size_t

		/* Find size of log and print to std output */
		cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG,
			0, nil, &log_size)
		cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG,
			log_size, &program_log, nil)
		fmt.Printf("%s\n", program_log)
		return
	}
	//utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLBuildProgram")

	//-----------------------------------------------------
	// STEP 7: Create the kernel
	//-----------------------------------------------------
	var kernel cl.CL_kernel

	// Use clCreateKernel() to create a kernel
	kernel = cl.CLCreateKernel(program, []byte("bst_kernel"), &status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel")
	defer cl.CLReleaseKernel(kernel)

	//-----------------------------------------------------
	// STEP 8: Set the kernel arguments
	//-----------------------------------------------------
	// Associate the input and output buffers with the
	// kernel
	// using clSetKernelArg()
	// Set appropriate arguments to the kernel
	status = cl.CLSetKernelArgSVMPointer(kernel,
		0,
		svmTreeBuf)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArgSVMPointer(svmTreeBuf)")

	status = cl.CLSetKernelArgSVMPointer(kernel,
		1,
		svmSearchBuf)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clSetKernelArgSVMPointer(svmSearchBuf)")

	//-----------------------------------------------------
	// STEP 9: Configure the work-item structure
	//-----------------------------------------------------
	// Define an index space (global work size) of work
	// items for
	// execution. A workgroup size (local work size) is not
	// required,
	// but can be used.
	var localWorkSize [1]cl.CL_size_t
	var kernelWorkGroupSize interface{}
	status = cl.CLGetKernelWorkGroupInfo(kernel,
		devices[0],
		cl.CL_KERNEL_WORK_GROUP_SIZE,
		cl.CL_size_t(unsafe.Sizeof(localWorkSize[0])),
		&kernelWorkGroupSize,
		nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetKernelWorkGroupInfo")
	localWorkSize[0] = kernelWorkGroupSize.(cl.CL_size_t)

	var globalWorkSize [1]cl.CL_size_t
	globalWorkSize[0] = NUMBER_OF_SEARCH_KEY

	//-----------------------------------------------------
	// STEP 10: Enqueue the kernel for execution
	//-----------------------------------------------------
	// Execute the kernel by using
	// clEnqueueNDRangeKernel().
	// 'globalWorkSize' is the 1D dimension of the
	// work-items
	status = cl.CLEnqueueNDRangeKernel(cmdQueue,
		kernel,
		1,
		nil,
		globalWorkSize[:],
		localWorkSize[:],
		0,
		nil,
		nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLEnqueueNDRangeKernel")

	status = cl.CLFlush(cmdQueue)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFlush")

	status = cl.CLFinish(cmdQueue)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "clFinish")

	//-----------------------------------------------------
	// STEP 11: Read the output buffer back to the host
	//-----------------------------------------------------
	// Use clEnqueueReadBuffer() to read the OpenCL output
	// buffer (bufferC)
	// to the host output array (C)
	//copy the data back to host buffer

	//this demo doesn't need clEnqueueReadBuffer due to SVM

	//-----------------------------------------------------
	// STEP 12: Verify the results
	//-----------------------------------------------------
	// reference implementation
	svmBinaryTreeCPUReference(cmdQueue,
		svmRoot,
		svmTreeBuf,
		svmSearchBuf)

	// compare the results and see if they match
	pass := svmCompareResults(cmdQueue, svmSearchBuf)
	if pass {
		println("Passed!")
	} else {
		println("Failed!")
	}
}
Ejemplo n.º 7
0
Archivo: svmfg.go Proyecto: xfong/gocl
func svmbasic(size cl.CL_size_t,
	context cl.CL_context,
	queue cl.CL_command_queue,
	kernel cl.CL_kernel) {
	// Prepare input data as follows.
	// Build two arrays:
	//     - an array that consists of the Element structures
	//       (refer to svmbasic.h for the structure definition)
	//     - an array that consists of the float values
	//
	// Each structure of the first array has the following pointers:
	//     - 'internal', which points to a 'value' field of another entry
	//       of the same array.
	//     - 'external', which points to a float value from the the
	//       second array.
	//
	// Pointers are set randomly. The structures do not reflect any real usage
	// scenario, but are illustrative for a simple device-side traversal.
	//
	//        Array of Element                        Array of floats
	//           structures
	//
	//    ||====================||
	//    ||    .............   ||                   ||============||
	//    ||    .............   ||<-----+            || .......... ||
	//    ||====================||      |            ||    float   ||
	//    ||   float* internal--||------+            ||    float   ||
	//    ||   float* external--||------------------>||    float   ||
	//    ||   float value <----||------+            || .......... ||
	//    ||====================||      |            || .......... ||
	//    ||    .............   ||      |            ||    float   ||
	//    ||    .............   ||      |            ||    float   ||
	//    ||====================||      |            ||    float   ||
	//    ||====================||      |            ||    float   ||
	//    ||   float* internal--||------+            ||    float   ||
	//    ||   float* external--||------------------>||    float   ||
	//    ||   float value      ||                   ||    float   ||
	//    ||====================||                   ||    float   ||
	//    ||    .............   ||                   || .......... ||
	//    ||    .............   ||                   ||============||
	//    ||====================||
	//
	// The two arrays are created independently and are used to illustrate
	// two new OpenCL 2.0 API functions:
	//    - the array of Element structures is passed to the kernel as a
	//      kernel argument with the clSetKernelArgSVMPointer function
	//    - the array of floats is used by the kernel indirectly, and this
	//      dependency should be also specified with the clSetKernelExecInfo
	//      function prior to the kernel execution

	var err cl.CL_int

	// To enable host & device code to share pointer to the same address space
	// the arrays should be allocated as SVM memory. Use the clSVMAlloc function
	// to allocate SVM memory.
	//
	// Optionally, this function allows specifying alignment in bytes as its
	// last argument. As this basic example doesn't require any _special_ alignment,
	// the following code illustrates requesting default alignment via passing
	// zero value.

	inputElements := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used
		cl.CL_MEM_READ_ONLY|cl.CL_MEM_SVM_FINE_GRAIN_BUFFER,
		size*cl.CL_size_t(unsafe.Sizeof(sampleElement)), // amount of memory to allocate (in bytes)
		0) // alignment in bytes (0 means default)
	if nil == inputElements {
		println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.")
		return
	}
	defer cl.CLSVMFree(context, inputElements)

	inputFloats := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used
		cl.CL_MEM_READ_ONLY|cl.CL_MEM_SVM_FINE_GRAIN_BUFFER,
		size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), // amount of memory to allocate (in bytes)
		0) // alignment in bytes (0 means default)
	if nil == inputFloats {
		println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.")
		return
	}
	defer cl.CLSVMFree(context, inputFloats)

	// The OpenCL kernel uses the aforementioned input arrays to compute
	// values for the output array.

	output := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used
		cl.CL_MEM_WRITE_ONLY|cl.CL_MEM_SVM_FINE_GRAIN_BUFFER,
		size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), // amount of memory to allocate (in bytes)
		0) // alignment in bytes (0 means default)
	defer cl.CLSVMFree(context, output)

	if nil == output {
		println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.")
		return
	}

	// Note: in the coarse-grained SVM, mapping of inputElement and inputFloats is
	// needed to do the following initialization. While here, in the fine-grained SVM,
	// it is not necessary.

	// Populate data-structures with initial data.
	r := rand.New(rand.NewSource(99))

	for i := cl.CL_size_t(0); i < size; i++ {
		inputElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(i)*unsafe.Sizeof(sampleElement)))
		inputFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(inputFloats) + uintptr(i)*unsafe.Sizeof(sampleFloat)))
		randElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(r.Intn(int(size)))*unsafe.Sizeof(sampleElement)))
		randFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(inputFloats) + uintptr(r.Intn(int(size)))*unsafe.Sizeof(sampleFloat)))

		inputElement.internal = &(randElement.value)
		inputElement.external = randFloat
		inputElement.value = cl.CL_float(i)
		*inputFloat = cl.CL_float(i + size)
	}

	// Note: in the coarse-grained SVM, unmapping of inputElement and inputFloats is
	// needed before scheduling the kernel for execution. While here, in the fine-grained SVM,
	// it is not necessary.

	// Pass arguments to the kernel.
	// According to the OpenCL 2.0 specification, you need to use a special
	// function to pass a pointer from SVM memory to kernel.

	err = cl.CLSetKernelArgSVMPointer(kernel, 0, inputElements)
	utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelArgSVMPointer")

	err = cl.CLSetKernelArgSVMPointer(kernel, 1, output)
	utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelArgSVMPointer")

	// For buffer based SVM (both coarse- and fine-grain) if one SVM buffer
	// points to memory allocated in another SVM buffer, such allocations
	// should be passed to the kernel via clSetKernelExecInfo.

	err = cl.CLSetKernelExecInfo(kernel,
		cl.CL_KERNEL_EXEC_INFO_SVM_PTRS,
		cl.CL_size_t(unsafe.Sizeof(inputFloats)),
		unsafe.Pointer(&inputFloats))
	utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelExecInfo")

	// Run the kernel.
	println("Running kernel...")

	var globalWorkSize [1]cl.CL_size_t
	globalWorkSize[0] = size

	err = cl.CLEnqueueNDRangeKernel(queue,
		kernel,
		1,
		nil,
		globalWorkSize[:],
		nil,
		0,
		nil,
		nil)
	utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueNDRangeKernel")

	// Note: In the fine-grained SVM, after enqueuing the kernel above, the host application is
	// not blocked from accessing SVM allocations that were passed to the kernel. The host
	// can access the same regions of SVM memory as does the kernel if the kernel and the host
	// read/modify different bytes. If one side (host or device) needs to modify the same bytes
	// that are simultaniously read/modified by another side, atomics operations are usually
	// required to maintain sufficient memory consistency. This sample doesn't use this possibility
	// and the host just waits in clFinish below until the kernel is finished.
	err = cl.CLFinish(queue)
	utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLFinish")

	println(" DONE.")

	// Validate output state for correctness.
	// Compare: in the coarse-grained SVM case you need to map the output.
	// Here it is not needed.

	println("Checking correctness of the output buffer...")
	for i := cl.CL_size_t(0); i < size; i++ {
		inputElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(i)*unsafe.Sizeof(sampleElement)))
		outputFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(output) + uintptr(i)*unsafe.Sizeof(sampleFloat)))
		expectedValue := *(inputElement.internal) + *(inputElement.external)
		if *outputFloat != expectedValue {
			println(" FAILED.")
			fmt.Printf("Mismatch at position %d, read %f, expected %f\n", i, *outputFloat, expectedValue)
			return
		}
	}
	println(" PASSED.")
}
Ejemplo n.º 8
0
Archivo: svmfg.go Proyecto: xfong/gocl
func main() {
	// Use this to check the output of each API call
	var status cl.CL_int

	//-----------------------------------------------------
	// STEP 1: Discover and initialize the platforms
	//-----------------------------------------------------
	var numPlatforms cl.CL_uint

	// Use clGetPlatformIDs() to retrieve the number of
	// platforms
	status = cl.CLGetPlatformIDs(0, nil, &numPlatforms)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetPlatformIDs")

	// Allocate enough space for each platform
	platforms := make([]cl.CL_platform_id, numPlatforms)

	// Fill in platforms with clGetPlatformIDs()
	status = cl.CLGetPlatformIDs(numPlatforms, platforms, nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetPlatformIDs")

	//-----------------------------------------------------
	// STEP 2: Discover and initialize the GPU devices
	//-----------------------------------------------------
	var numDevices cl.CL_uint

	// Use clGetDeviceIDs() to retrieve the number of
	// devices present
	status = cl.CLGetDeviceIDs(platforms[0],
		cl.CL_DEVICE_TYPE_GPU,
		0,
		nil,
		&numDevices)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs")

	// Allocate enough space for each device
	devices := make([]cl.CL_device_id, numDevices)

	// Fill in devices with clGetDeviceIDs()
	status = cl.CLGetDeviceIDs(platforms[0],
		cl.CL_DEVICE_TYPE_GPU,
		numDevices,
		devices,
		nil)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLGetDeviceIDs")

	var caps cl.CL_device_svm_capabilities
	var caps_value interface{}

	status = cl.CLGetDeviceInfo(
		devices[0],
		cl.CL_DEVICE_SVM_CAPABILITIES,
		cl.CL_size_t(unsafe.Sizeof(caps)),
		&caps_value,
		nil)
	caps = caps_value.(cl.CL_device_svm_capabilities)

	// Coarse-grained buffer SVM should be available on any OpenCL 2.0 device.
	// So it is either not an OpenCL 2.0 device or it must support coarse-grained buffer SVM:
	if !(status == cl.CL_SUCCESS && (caps&cl.CL_DEVICE_SVM_FINE_GRAIN_BUFFER) != 0) {
		fmt.Printf("Cannot detect fine-grained buffer SVM capabilities on the device. The device seemingly doesn't support fine-grained buffer SVM. caps=%x\n", caps)
		println("")
		return
	}

	//-----------------------------------------------------
	// STEP 3: Create a context
	//-----------------------------------------------------
	// Create a context using clCreateContext() and
	// associate it with the devices
	context := cl.CLCreateContext(nil,
		numDevices,
		devices,
		nil,
		nil,
		&status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateContext")
	defer cl.CLReleaseContext(context)

	//-----------------------------------------------------
	// STEP 4: Create a command queue
	//-----------------------------------------------------
	// Create a command queue using clCreateCommandQueueWithProperties(),
	// and associate it with the device you want to execute
	queue := cl.CLCreateCommandQueueWithProperties(context,
		devices[0],
		nil,
		&status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateCommandQueueWithProperties")
	defer cl.CLReleaseCommandQueue(queue)

	//-----------------------------------------------------
	// STEP 5: Create and compile the program
	//-----------------------------------------------------
	programSource, programeSize := utils.Load_programsource("svmfg.cl")

	// Create a program using clCreateProgramWithSource()
	program := cl.CLCreateProgramWithSource(context,
		1,
		programSource[:],
		programeSize[:],
		&status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateProgramWithSource")
	defer cl.CLReleaseProgram(program)

	// Build (compile) the program for the devices with
	// clBuildProgram()
	options := "-cl-std=CL2.0"

	status = cl.CLBuildProgram(program,
		numDevices,
		devices,
		[]byte(options),
		nil,
		nil)
	if status != cl.CL_SUCCESS {
		var log interface{}
		var log_size cl.CL_size_t
		/* Find size of log and print to std output */
		cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG, 0, nil, &log_size)
		cl.CLGetProgramBuildInfo(program, devices[0], cl.CL_PROGRAM_BUILD_LOG, log_size, &log, nil)
		fmt.Printf("%s\n", log)
		return
	}
	//utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLBuildProgram")

	//-----------------------------------------------------
	// STEP 7: Create the kernel
	//-----------------------------------------------------
	// Use clCreateKernel() to create a kernel
	kernel := cl.CLCreateKernel(program,
		[]byte("svmbasic"),
		&status)
	utils.CHECK_STATUS(status, cl.CL_SUCCESS, "CLCreateKernel")
	defer cl.CLReleaseKernel(kernel)

	// Then call the main sample routine - resource allocations, OpenCL kernel
	// execution, and so on.
	svmbasic(1024*1024, context, queue, kernel)

	// All resource deallocations happen in defer.
}
Ejemplo n.º 9
0
Archivo: svmcg.go Proyecto: xfong/gocl
func svmbasic(size cl.CL_size_t,
	context cl.CL_context,
	queue cl.CL_command_queue,
	kernel cl.CL_kernel) {
	// Prepare input data as follows.
	// Build two arrays:
	//     - an array that consists of the Element structures
	//       (refer to svmbasic.h for the structure definition)
	//     - an array that consists of the float values
	//
	// Each structure of the first array has the following pointers:
	//     - 'internal', which points to a 'value' field of another entry
	//       of the same array.
	//     - 'external', which points to a float value from the the
	//       second array.
	//
	// Pointers are set randomly. The structures do not reflect any real usage
	// scenario, but are illustrative for a simple device-side traversal.
	//
	//        Array of Element                        Array of floats
	//           structures
	//
	//    ||====================||
	//    ||    .............   ||                   ||============||
	//    ||    .............   ||<-----+            || .......... ||
	//    ||====================||      |            ||    float   ||
	//    ||   float* internal--||------+            ||    float   ||
	//    ||   float* external--||------------------>||    float   ||
	//    ||   float value <----||------+            || .......... ||
	//    ||====================||      |            || .......... ||
	//    ||    .............   ||      |            ||    float   ||
	//    ||    .............   ||      |            ||    float   ||
	//    ||====================||      |            ||    float   ||
	//    ||====================||      |            ||    float   ||
	//    ||   float* internal--||------+            ||    float   ||
	//    ||   float* external--||------------------>||    float   ||
	//    ||   float value      ||                   ||    float   ||
	//    ||====================||                   ||    float   ||
	//    ||    .............   ||                   || .......... ||
	//    ||    .............   ||                   ||============||
	//    ||====================||
	//
	// The two arrays are created independently and are used to illustrate
	// two new OpenCL 2.0 API functions:
	//    - the array of Element structures is passed to the kernel as a
	//      kernel argument with the clSetKernelArgSVMPointer function
	//    - the array of floats is used by the kernel indirectly, and this
	//      dependency should be also specified with the clSetKernelExecInfo
	//      function prior to the kernel execution

	var err cl.CL_int

	// To enable host & device code to share pointer to the same address space
	// the arrays should be allocated as SVM memory. Use the clSVMAlloc function
	// to allocate SVM memory.
	//
	// Optionally, this function allows specifying alignment in bytes as its
	// last argument. As this basic example doesn't require any _special_ alignment,
	// the following code illustrates requesting default alignment via passing
	// zero value.

	inputElements := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used
		cl.CL_MEM_READ_ONLY,
		size*cl.CL_size_t(unsafe.Sizeof(sampleElement)), // amount of memory to allocate (in bytes)
		0) // alignment in bytes (0 means default)
	if nil == inputElements {
		println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.")
		return
	}
	defer cl.CLSVMFree(context, inputElements)

	inputFloats := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used
		cl.CL_MEM_READ_ONLY,
		size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), // amount of memory to allocate (in bytes)
		0) // alignment in bytes (0 means default)
	if nil == inputFloats {
		println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.")
		return
	}
	defer cl.CLSVMFree(context, inputFloats)

	// The OpenCL kernel uses the aforementioned input arrays to compute
	// values for the output array.

	output := cl.CLSVMAlloc(context, // the context where this memory is supposed to be used
		cl.CL_MEM_WRITE_ONLY,
		size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)), // amount of memory to allocate (in bytes)
		0) // alignment in bytes (0 means default)
	defer cl.CLSVMFree(context, output)

	if nil == output {
		println("Cannot allocate SVM memory with clSVMAlloc: it returns null pointer. You might be out of memory.")
		return
	}

	// In the coarse-grained buffer SVM model, only one OpenCL device (or
	// host) can have ownership for writing to the buffer. Specifically, host
	// explicitly requests the ownership by mapping/unmapping the SVM buffer.
	//
	// So to fill the input SVM buffers on the host, you need to map them to have
	// access from the host program.
	//
	// The following two map calls are required in case of coarse-grained SVM only.

	err = cl.CLEnqueueSVMMap(queue,
		cl.CL_TRUE, // blocking map
		cl.CL_MAP_WRITE,
		inputElements,
		size*cl.CL_size_t(unsafe.Sizeof(sampleElement)),
		0,
		nil,
		nil)
	utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMMap")

	err = cl.CLEnqueueSVMMap(queue,
		cl.CL_TRUE, // blocking map
		cl.CL_MAP_WRITE,
		inputFloats,
		size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)),
		0,
		nil,
		nil)
	utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMMap")

	// Populate data-structures with initial data.
	r := rand.New(rand.NewSource(99))

	for i := cl.CL_size_t(0); i < size; i++ {
		inputElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(i)*unsafe.Sizeof(sampleElement)))
		inputFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(inputFloats) + uintptr(i)*unsafe.Sizeof(sampleFloat)))
		randElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(r.Intn(int(size)))*unsafe.Sizeof(sampleElement)))
		randFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(inputFloats) + uintptr(r.Intn(int(size)))*unsafe.Sizeof(sampleFloat)))

		inputElement.internal = &(randElement.value)
		inputElement.external = randFloat
		inputElement.value = cl.CL_float(i)
		*inputFloat = cl.CL_float(i + size)
	}

	// The following two unmap calls are required in case of coarse-grained SVM only

	err = cl.CLEnqueueSVMUnmap(queue,
		inputElements,
		0,
		nil,
		nil)
	utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMUnmap")

	err = cl.CLEnqueueSVMUnmap(queue,
		inputFloats,
		0,
		nil,
		nil)
	utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMUnmap")

	// Pass arguments to the kernel.
	// According to the OpenCL 2.0 specification, you need to use a special
	// function to pass a pointer from SVM memory to kernel.

	err = cl.CLSetKernelArgSVMPointer(kernel, 0, inputElements)
	utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelArgSVMPointer")

	err = cl.CLSetKernelArgSVMPointer(kernel, 1, output)
	utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelArgSVMPointer")

	// For buffer based SVM (both coarse- and fine-grain) if one SVM buffer
	// points to memory allocated in another SVM buffer, such allocations
	// should be passed to the kernel via clSetKernelExecInfo.

	err = cl.CLSetKernelExecInfo(kernel,
		cl.CL_KERNEL_EXEC_INFO_SVM_PTRS,
		cl.CL_size_t(unsafe.Sizeof(inputFloats)),
		unsafe.Pointer(&inputFloats))
	utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLSetKernelExecInfo")

	// Run the kernel.
	println("Running kernel...")

	var globalWorkSize [1]cl.CL_size_t
	globalWorkSize[0] = size

	err = cl.CLEnqueueNDRangeKernel(queue,
		kernel,
		1,
		nil,
		globalWorkSize[:],
		nil,
		0,
		nil,
		nil)
	utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueNDRangeKernel")

	// Map the output SVM buffer to read the results.
	// Mapping is required for coarse-grained SVM only.

	err = cl.CLEnqueueSVMMap(queue,
		cl.CL_TRUE, // blocking map
		cl.CL_MAP_READ,
		output,
		size*cl.CL_size_t(unsafe.Sizeof(sampleFloat)),
		0,
		nil,
		nil)
	utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMMap")

	println(" DONE.")

	// Validate output state for correctness.

	println("Checking correctness of the output buffer...")
	for i := cl.CL_size_t(0); i < size; i++ {
		inputElement := (*Element)(unsafe.Pointer(uintptr(inputElements) + uintptr(i)*unsafe.Sizeof(sampleElement)))
		outputFloat := (*cl.CL_float)(unsafe.Pointer(uintptr(output) + uintptr(i)*unsafe.Sizeof(sampleFloat)))
		expectedValue := *(inputElement.internal) + *(inputElement.external)
		if *outputFloat != expectedValue {
			println(" FAILED.")
			fmt.Printf("Mismatch at position %d, read %f, expected %f\n", i, *outputFloat, expectedValue)
			return
		}
	}
	println(" PASSED.")

	err = cl.CLEnqueueSVMUnmap(queue,
		output,
		0,
		nil,
		nil)
	utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLEnqueueSVMUnmap")

	err = cl.CLFinish(queue)
	utils.CHECK_STATUS(err, cl.CL_SUCCESS, "CLFinish")
}