After spending hours reading and trying to understand OpenCL, I have finally written my first OpenCL program that actually does something: it adds two vectors and writes the result to a file.
#include <iostream>
#include <vector>
#include <cstdlib>
#include <string>
#include <fstream>
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
int main(int argc, char *argv[])
{
try
{
// get platforms, devices and display their info.
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
std::vector<cl::Platform>::iterator i=platforms.begin();
std::cout<<"OpenCL \tPlatform : "<<i->getInfo<CL_PLATFORM_NAME>()<<std::endl;
std::cout<<"\tVendor: "<<i->getInfo<CL_PLATFORM_VENDOR>()<<std::endl;
std::cout<<"\tVersion : "<<i->getInfo<CL_PLATFORM_VERSION>()<<std::endl;
std::cout<<"\tExtensions : "<<i->getInfo<CL_PLATFORM_EXTENSIONS>()<<std::endl;
// get devices
std::vector<cl::Device> devices;
i->getDevices(CL_DEVICE_TYPE_ALL,&devices);
int o=99;
std::cout<<"\n\n";
// iterate over available devices
for(std::vector<cl::Device>::iterator j=devices.begin(); j!=devices.end(); j++)
{
std::cout<<"\tOpenCL\tDevice : " << j->getInfo<CL_DEVICE_NAME>()<<std::endl;
std::cout<<"\t\t Type : " << j->getInfo<CL_DEVICE_TYPE>()<<std::endl;
std::cout<<"\t\t Vendor : " << j->getInfo<CL_DEVICE_VENDOR>()<<std::endl;
std::cout<<"\t\t Driver : " << j->getInfo<CL_DRIVER_VERSION>()<<std::endl;
std::cout<<"\t\t Global Mem : " << j->getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()/(1024*1024)<<" MBytes"<<std::endl;
std::cout<<"\t\t Local Mem : " << j->getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()/1024<<" KBbytes"<<std::endl;
std::cout<<"\t\t Compute Unit : " << j->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()<<std::endl;
std::cout<<"\t\t Clock Rate : " << j->getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>()<<" MHz"<<std::endl;
}
std::cout<<"\n\n\n";
//MAIN CODE BEGINS HERE
//get Kernel
std::ifstream ifs("vector_add_kernel.cl");
std::string kernelSource((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
std::cout<<kernelSource;
//Create context, select device and command queue.
cl::Context context(devices);
cl::Device &device=devices.front();
cl::CommandQueue cmdqueue(context,device);
// Generate Source vector and push the kernel source in it.
cl::Program::Sources sourceCode;
sourceCode.push_back(std::make_pair(kernelSource.c_str(), kernelSource.size()));
//Generate program using sourceCode
cl::Program program=cl::Program(context, sourceCode);
//Build program..
try
{
program.build(devices);
}
catch(cl::Error &err)
{
std::cerr<<"Building failed, "<<err.what()<<"("<<err.err()<<")"
<<"\nRetrieving build log"
<<"\n Build Log Follows \n"
<<program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices.front());
}
//Declare and initialize vectors
std::vector<cl_float>B(993448,1.3);
std::vector<cl_float>C(993448,1.3);
std::vector<cl_float>A(993448,1.3);
cl_int N=A.size();
//Declare and intialize proper work group size and global size. Global size raised to the nearest multiple of workGroupSize.
int workGroupSize=128;
int GlobalSize;
if(N%workGroupSize) GlobalSize=N - N%workGroupSize + workGroupSize;
else GlobalSize=N;
//Declare buffers.
cl::Buffer vecA(context, CL_MEM_READ_WRITE, sizeof(cl_float)*N);
cl::Buffer vecB(context, CL_MEM_READ_ONLY , (B.size())*sizeof(cl_float));
cl::Buffer vecC(context, CL_MEM_READ_ONLY , (C.size())*sizeof(cl_float));
//Write vectors into buffers
cmdqueue.enqueueWriteBuffer(vecB, 0, 0, (B.size())*sizeof(cl_float), &B[0] );
cmdqueue.enqueueWriteBuffer(vecB, 0, 0, (C.size())*sizeof(cl_float), &C[0] );
//Executing kernel
cl::Kernel kernel(program, "vector_add");
cl::KernelFunctor kernel_func=kernel.bind(cmdqueue, cl::NDRange(GlobalSize), cl::NDRange(workGroupSize));
kernel_func(vecA, vecB, vecC, N);
//Reading back values into vector A
cmdqueue.enqueueReadBuffer(vecA,true,0,N*sizeof(cl_float), &A[0]);
cmdqueue.finish();
//Saving into file.
std::ofstream output("vectorAdd.txt");
for(int i=0;i<N;i++) output<<A[i]<<"\n";
}
catch(cl::Error& err)
{
std::cerr << "OpenCL error: " << err.what() << "(" << err.err() <<
")" << std::endl;
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
The problem is that for smaller values of N I get the correct result, which is 2.6, but for larger values, like the one in the code above (993448), I get garbage output varying between 1 and 2.4.
Here is the Kernel code :
// Element-wise vector addition: each work-item stores C[k] + B[k] into A[k].
__kernel void vector_add(__global float *A, __global float *B, __global float *C, int N)
{
    // The global id of this work-item selects the element it handles.
    const int idx = get_global_id(0);

    // Work-items whose id is at or beyond N have no element and do nothing.
    if (idx >= N)
        return;

    A[idx] = C[idx] + B[idx];
}
UPDATE: OK, it seems the code is working now. I have fixed a few minor mistakes in my code above: 1) the part where GlobalSize is initialized has been fixed; 2) a stupid mistake in enqueueWriteBuffer (wrong parameters given) has been fixed. It now outputs the correct result for large values of N.
Try to change the data type from float to double etc.