func setKernelArg(kernel cl.CL_kernel, pos int, data interface{}) { var status cl.CL_int switch data := data.(type) { case *cl.CL_mem: status = cl.CLSetKernelArg( kernel, cl.CL_uint(pos), cl.CL_size_t(unsafe.Sizeof(data)), unsafe.Pointer(data)) case *cl.CL_uint: status = cl.CLSetKernelArg( kernel, cl.CL_uint(pos), cl.CL_size_t(unsafe.Sizeof(*data)), unsafe.Pointer(data)) default: log.Fatalf("Fatal error: setting kernel arg for unknown type %t.", data) } if status != cl.CL_SUCCESS { log.Printf("%v", cl.ERROR_CODES_STRINGS[-status]) log.Fatalf("Fatal error: could not set arg %d for OpenCL kernel.", pos) } }
func runOpenCL() { var status cl.CL_int var numPlatforms cl.CL_uint //--------------------------------------------------- // Step 1: Discover and retrieve OpenCL platforms. //--------------------------------------------------- status = cl.CLGetPlatformIDs(0, nil, &numPlatforms) platforms := make([]cl.CL_platform_id, numPlatforms) requireSuccess(cl.CLGetPlatformIDs(numPlatforms, platforms, nil), "could not retrieve OpenCL platform IDs.") if verbosity >= 4 { printPlatforms(platforms) } //--------------------------------------------------- // Step 2: Discover and retrieve OpenCL devices. //--------------------------------------------------- var preferredType cl.CL_device_type if useCPU { preferredType = cl.CL_DEVICE_TYPE_CPU } else { preferredType = cl.CL_DEVICE_TYPE_GPU } _, gpuDevice := findDevice(platforms, preferredType) gpuDevices := make([]cl.CL_device_id, 1) gpuDevices[0] = gpuDevice if verbosity >= 4 { printDeviceInfo(gpuDevice) } //--------------------------------------------------- // Step 3: Create an OpenCL context. //--------------------------------------------------- context := cl.CLCreateContext(nil, 1, gpuDevices, nil, nil, &status) requireSuccess(status, "could not create OpenCL context.") defer cl.CLReleaseContext(context) //--------------------------------------------------- // Step 3: Create an OpenCL command queue. //--------------------------------------------------- commandQueue := cl.CLCreateCommandQueue(context, gpuDevice, 0, &status) requireSuccess(status, "could not create OpenCL command queue.") defer cl.CLReleaseCommandQueue(commandQueue) //--------------------------------------------------- // Step 4: Create OpenCL program and kernel. //--------------------------------------------------- var clSourceData [3][]byte var clSourceLengths [3]cl.CL_size_t var err error clSourceFiles := []string{ "kernels/" + problems[problemIndex].clSource, "kernels/rng.cl", "kernels/gom.cl", } for i, s := range clSourceFiles { clSourceData[i], err = ioutil.ReadFile(s) if err != nil { log.Fatalf("Could not read the kernel source file %s.", s) } clSourceLengths[i] = cl.CL_size_t(len(clSourceData[i])) } program := cl.CLCreateProgramWithSource(context, 3, clSourceData[:], clSourceLengths[:], &status) requireSuccess(status, "could not compile an OpenCL kernel from source.") status = cl.CLBuildProgram(program, 1, gpuDevices, nil, nil, nil) if status != cl.CL_SUCCESS { printProgramBuildInfo(program, gpuDevice) } kernel := cl.CLCreateKernel(program, []byte("gom"), &status) requireSuccess(status, "could not create OpenCL kernel.") //--------------------------------------------------- // Step 6: Initialize OpenCL memory. //--------------------------------------------------- if verbosity >= 4 { printKernelWorkGroup(kernel, gpuDevice) } //--------------------------------------------------- // Step 7: Initialize OpenCL memory. //--------------------------------------------------- var size cl.CL_uint length := cl.CL_size_t(problemLength) pop := ga.NewPopulation(populationSize, problemLength) numBlocks := blocksPerSolution(pop) * pop.Size() dataSize := cl.CL_size_t(unsafe.Sizeof(size)) * cl.CL_size_t(numBlocks) populationData := make([]cl.CL_uint, numBlocks) offspringData := make([]cl.CL_uint, numBlocks) populationBuffer := cl.CLCreateBuffer( context, cl.CL_MEM_READ_ONLY, dataSize, nil, &status) requireSuccess(status, "could not allocate an OpenCL memory buffer.") cloneBuffer := cl.CLCreateBuffer( context, cl.CL_MEM_READ_WRITE, dataSize, nil, &status) requireSuccess(status, "could not allocate an OpenCL memory buffer.") // Maximum bound on the number of elements in the LT + node sizes. boundSum := (length*length+3*length-2)/2 + (2*length - 1) + 1 ltSize := cl.CL_size_t(unsafe.Sizeof(length)) * boundSum ltData := make([]cl.CL_uint, boundSum) ltBuffer := cl.CLCreateBuffer(context, cl.CL_MEM_READ_ONLY, ltSize, nil, &status) requireSuccess(status, "could not allocate an OpenCL memory buffer.") var dummyCLBool cl.CL_char improvsSize := cl.CL_size_t(unsafe.Sizeof(dummyCLBool)) * cl.CL_size_t(pop.Size()) improvsData := make([]cl.CL_char, pop.Size()) improvsBuffer := cl.CLCreateBuffer(context, cl.CL_MEM_WRITE_ONLY, improvsSize, nil, &status) requireSuccess(status, "could not allocate an OpenCL memory buffer.") offspringBuffer := cl.CLCreateBuffer( context, cl.CL_MEM_WRITE_ONLY, dataSize, nil, &status) requireSuccess(status, "could not allocate an OpenCL memory buffer.") //--------------------------------------------------- // Step 8: Perform GOMEA. //--------------------------------------------------- if randomSeed == 0 { rand.Seed(time.Now().Unix()) } else { rand.Seed(int64(randomSeed)) } done := false generationsPassed := 0 if verbosity >= 3 { printGeneration(0, pop) } for !done { // Build the linkage tree and upload a flattened version to the compute device. freqs := Frequencies(pop) lt := LinkageTree(pop, freqs) flattenIntoSlice(lt, ltData) requireSuccess(cl.CLEnqueueWriteBuffer( commandQueue, ltBuffer, cl.CL_TRUE, 0, ltSize, unsafe.Pointer(<Data[0]), 0, nil, nil), "could not write data to an OpenCL memory buffer.") // Store a flattened version of the population on the compute device. populationToSlice(pop, populationData) requireSuccess(cl.CLEnqueueWriteBuffer( commandQueue, populationBuffer, cl.CL_TRUE, 0, dataSize, unsafe.Pointer(&populationData[0]), 0, nil, nil), "could not write data to an OpenCL memory buffer.") // Set the GOM kernel arguments. popSize := cl.CL_uint(pop.Size()) solLength := cl.CL_uint(pop.Length()) setKernelArg(kernel, 0, &populationBuffer) setKernelArg(kernel, 1, &popSize) setKernelArg(kernel, 2, &solLength) setKernelArg(kernel, 3, &cloneBuffer) setKernelArg(kernel, 4, <Buffer) setKernelArg(kernel, 5, &improvsBuffer) setKernelArg(kernel, 6, &offspringBuffer) var globalWorkSize [1]cl.CL_size_t globalWorkSize[0] = cl.CL_size_t(pop.Size()) // Perform GOM crossover. requireSuccess(cl.CLEnqueueNDRangeKernel( commandQueue, kernel, 1, nil, globalWorkSize[:], nil, 0, nil, nil), "could not enqueue OpenCL kernel.") requireSuccess(cl.CLFinish(commandQueue), "could not finish command queue.") // Retrieve the offspring population from the compute device. requireSuccess(cl.CLEnqueueReadBuffer( commandQueue, offspringBuffer, cl.CL_TRUE, 0, dataSize, unsafe.Pointer(&offspringData[0]), 0, nil, nil), "reading a buffer failed.") requireSuccess(cl.CLEnqueueReadBuffer( commandQueue, improvsBuffer, cl.CL_TRUE, 0, improvsSize, unsafe.Pointer(&improvsData[0]), 0, nil, nil), "reading improvs buffer failed.") foundOptimal := sliceToPopulation(offspringData, pop) generationsPassed++ if (verbosity == 2 && done) || (verbosity == 3) { printGeneration(generationsPassed, pop) } // TODO: Termination Criterion if generationsPassed == numGenerations { done = true } improved := false for _, b := range improvsData { if b > 0 { improved = true break } } if !improved { if verbosity >= 2 { log.Println("Terminated after the population did not improve for one generation.") } done = true } if foundOptimal { if verbosity >= 2 { log.Printf("Optimal solution found after %d generations.\n", generationsPassed) } done = true } } }