A Jump Start to OpenCLMarch 15, 2009CIS 565/665 – GPU Computing and ArchitectureAnother Language to Program Parallel Computing DevicesSources• OpenCL Tutorial - Introduction to OpenCL• OpenCL for NVIDIA GPUs – Chris Lamb• OpenCL – Parallel Computing for Heterogeneous Devices (SIGGRAPH Asia) –Khronos Group• NVIDIA OpenCL Jump Start Guide• OpenCL – Making Use of What You’ve Got• OpenCL Basics and Advanced (PPAM 2009) – Dominik BehrSources• OpenCL Tutorial - Introduction to OpenCL• OpenCL for NVIDIA GPUs – Chris Lamb• OpenCL – Parallel Computing for Heterogeneous Devices (SIGGRAPH Asia) –Khronos Group• NVIDIA OpenCL Jump Start Guide• OpenCL – Making Use of What You’ve Got• OpenCL Basics and Advanced (PPAM 2009) – Dominik BehrCUDA Working Group• Because of Nexus and Visual Studio Integration….Anatomy of OpenCL• Language Specification• C-based cross-platform programming interface• Subset of ISO C99 with language extensions - familiar to developers• Well-defined numerical accuracy (IEEE 754 rounding with specified max error)• Online or offline compilation and build of compute kernel executables• Includes a rich set of built-in functions• Platform Layer API• A hardware abstraction layer over diverse computational resources• Query, select and initialize compute devices• Create compute contexts and work-queues• Runtime API• Execute compute kernels• Manage scheduling, compute, and memory resourcesCUDAMemory Model ComparisonOpenCL CUDACUDA vs OpenCLArchitecture – Execution Model• Kernel – Smallest unit of execution, like a C function• Host program – A collection of kernels• Work item, an instance of a kernel at run time• Work group, a collection of work itemsCommand QueuesCUDA vs OpenCL API Differences• Naming Schemes • How data gets passed to the API• C for CUDA programs are compiled with an external tool (NVCC compiler)• OpenCL compiler is typically invoked at runtime (you can offline compile too)CUDAOpenCLcuInit(0); cuDeviceGet(&hContext, 0); cuCtxCreate(&hContext, 0, hDevice)); 
CUdeviceptr pDeviceMemA, pDeviceMemB, pDeviceMemC; cuMemAlloc(&pDeviceMemA, cnDimension * sizeof(float)); cuMemAlloc(&pDeviceMemB, cnDimension * sizeof(float)); cuMemAlloc(&pDeviceMemC, cnDimension * sizeof(float)); // copy host vectors to device cuMemcpyHtoD(pDeviceMemA, pA, cnDimension* sizeof(float)); cuMemcpyHtoD(pDeviceMemB, pB, cnDimension* sizeof(float)); cuFuncSetBlockShape(cuFunction, cnBlockSize, 1, 1); cuLaunchGrid (cuFunction, cnBlocks, 1); cl_context hContext; hContext = clCreateContextFromType(0, CL_DEVICE_DEVICE_TYPE_GPU, 0,0,0);cl_mem hDeviceMemA, hDeviceMemB, hDeviceMemC; hDeviceMemA = clCreateBuffer(hContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pA, 0); hDeviceMemB = clCreateBuffer(hContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pA, 0); hDeviceMemC = clCreateBuffer(hContext, CL_MEM_WRITE_ONLY, cnDimension * sizeof(cl_float) 0, 0);clEnqueueNDRangeKernel(hCmdQueue, hKernel, 1, 0, &cnDimension, &cnBlockSize, 0, 0, 0);CUDA Pointer Traversalstruct Node { Node* next; } n = n->next; // undefined operation in OpenCL, // since ‘n’ here is a kernel inputOpenCL Pointer Traversalstruct Node { unsigned int next; } …n = bufBase + n; // pointer arithmetic is fine, bufBase is // a kernel input param to the buffer’s beginningCUDA Kernel code:__global__ void vectorAdd(const float * a, const float * b, float * c) { // Vector element index int nIndex = blockIdx.x * blockDim.x + threadIdx.x; c[nIndex] = a[nIndex] + b[nIndex]; }OpenCL Kernel code:__kernel void vectorAdd(__global const float * a, __global const float * b, __global float * c) { // Vector element index int nIndex = get_global_id(0); c[nIndex] = a[nIndex] + b[nIndex]; } CUDA kernel functions are declared using the “__global__”function modifierOpenCL kernel functions are declared using “__kernel”.CUDA Driver API Host code:const unsigned int cnBlockSize = 512; const unsigned int cnBlocks = 3; const unsigned int cnDimension = cnBlocks * 
cnBlockSize; CUdevice hDevice; CUcontext hContext; CUmodule hModule; CUfunction hFunction; // create CUDA device & context cuInit(0); cuDeviceGet(&hContext, 0); // pick first device cuCtxCreate(&hContext, 0, hDevice)); cuModuleLoad(&hModule, “vectorAdd.cubin”); cuModuleGetFunction(&hFunction, hModule, "vectorAdd"); // allocate host vectors float * pA = new float[cnDimension]; float * pB = new float[cnDimension]; float * pC = new float[cnDimension]; // initialize host memory randomInit(pA, cnDimension); randomInit(pB, cnDimension); // allocate memory on the device CUdeviceptr pDeviceMemA, pDeviceMemB, pDeviceMemC; cuMemAlloc(&pDeviceMemA, cnDimension * sizeof(float)); cuMemAlloc(&pDeviceMemB, cnDimension * sizeof(float)); cuMemAlloc(&pDeviceMemC, cnDimension * sizeof(float)); // copy host vectors to device cuMemcpyHtoD(pDeviceMemA, pA, cnDimension * sizeof(float)); cuMemcpyHtoD(pDeviceMemB, pB, cnDimension * sizeof(float)); // setup parameter values cuFuncSetBlockShape(cuFunction, cnBlockSize, 1, 1); cuParamSeti(cuFunction, 0, pDeviceMemA); cuParamSeti(cuFunction, 4, pDeviceMemB); cuParamSeti(cuFunction, 8, pDeviceMemC); cuParamSetSize(cuFunction, 12); // execute kernel cuLaunchGrid(cuFunction, cnBlocks, 1); // copy the result from device back to host cuMemcpyDtoH((void *) pC, pDeviceMemC, cnDimension * sizeof(float)); delete[] pA; delete[] pB; delete[] pC; cuMemFree(pDeviceMemA); cuMemFree(pDeviceMemB); cuMemFree(pDeviceMemC);OpenCL Host Code:const unsigned int cnBlockSize = 512; const unsigned int cnBlocks = 3; const unsigned int cnDimension = cnBlocks * cnBlockSize; // create OpenCL device & context cl_context hContext; hContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, 0, 0, 0); // query all devices available to the context size_t nContextDescriptorSize; clGetContextInfo(hContext, CL_CONTEXT_DEVICES, 0, 0, &nContextDescriptorSize); cl_device_id * aDevices = malloc(nContextDescriptorSize); clGetContextInfo(hContext, CL_CONTEXT_DEVICES, 
nContextDescriptorSize, aDevices, 0); // create a command queue for first device the context reported cl_command_queue hCmdQueue; hCmdQueue = clCreateCommandQueue(hContext, aDevices[0], 0, 0); // create & compile program cl_program hProgram; hProgram = clCreateProgramWithSource(hContext, 1, sProgramSource, 0, 0); clBuildProgram(hProgram, 0, 0, 0, 0, 0); // create kernel cl_kernel hKernel; hKernel =
View Full Document