I'm having a problem implementing a feed-forward multilayer perceptron with back-prop learning in OpenCL in Java, using JOCL. Here is the kernel code for the calculation phase:
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
__kernel void Neuron(__global const double *inputPatterns,
                     __global double *weights,
                     __global const int *numInputs,
                     __global const int *activation,
                     __global const double *bias,
                     __global const int *usingBias,
                     __global double *values,
                     __global const int *maxNumFloats,
                     __global const int *patternIndex,
                     __global const int *inputPatternSize,
                     __global const int *indexOffset,
                     __global const int *isInputNeuron,
                     __global const int *inputs)
{
    int gid = get_global_id(0);
    double sum = 0.0;
    //Weighted sum over this neuron's inputs (empty for the input layer)
    for(int i = 0; i < numInputs[gid+indexOffset[0]]; i++)
    {
        sum += values[inputs[(gid+indexOffset[0]) * maxNumFloats[0] + i]] *
                weights[(gid+indexOffset[0]) * maxNumFloats[0] + i];
    }
    if(usingBias[gid+indexOffset[0]])
        sum += bias[gid+indexOffset[0]];
    //Input-layer neurons add the relevant value from the current input pattern
    if(isInputNeuron[gid+indexOffset[0]])
        sum += inputPatterns[gid+indexOffset[0]+(patternIndex[0] * inputPatternSize[0])];
    //activation == 1 means logistic sigmoid
    if(activation[gid+indexOffset[0]] == 1)
        sum = 1.0 / (1.0 + exp(-sum));
    values[gid + indexOffset[0]] = sum;
}
Basically, I run this kernel once for each layer in the network. For the first layer there are no "inputs", so the loop does not execute. As the first layer is an input-node layer, however, it does add the relevant value from the input pattern. This executes fine, and I can read back the values at this point.
When I try to run the SECOND layer, however (which does have inputs: every node from the first layer), a call to clFinish() returns the error CL_INVALID_COMMAND_QUEUE. Sometimes this error is coupled with a driver crash and recovery. I have read around (here, for example) that this might be a problem with TDR timeouts, and I have made an attempt to raise the limit, but I'm not sure whether it is making any difference.
I'm going through the calls to clSetKernelArg() to check for anything stupid, but can anyone spot anything obviously off in the code? It would seem that the error is introduced in the second layer due to the inclusion of the for loop... I can clarify any of the parameters if needed, but it seemed a bit overkill for an initial post.
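For what it's worth, the buffer arguments are bound with the usual JOCL pattern; the sketch below shows roughly what I mean (the loop over memObjects and the argument indices are illustrative only, not the actual pastebin code, though it assumes the same memObjects / kernel_iterate names as the host code in the edit below):

//Illustrative only: typical JOCL binding of __global buffer arguments.
//The argument order must match the kernel signature exactly.
for(int arg = 0; arg < memObjects.length; arg++)
{
    ret = clSetKernelArg(kernel_iterate, arg, Sizeof.cl_mem,
        Pointer.to(memObjects[arg]));
    if(ret != CL_SUCCESS)
        System.err.println("clSetKernelArg failed for arg " + arg + ": " + ret);
}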
Also, I'm fully aware this code will probably be an affront to competent coders everywhere, but feel free to flame :P
EDIT: Host code:
//Calc
for(int k = 0; k < GPUTickList.length; k++)
{
    clFlush(clCommandQueue);
    clFinish(clCommandQueue);
    //If input nodes
    if(k == 0)
        //Set index offset to 0
        GPUMapIndexOffset.asIntBuffer().put(0, 0);
    else
        //Update index offset
        GPUMapIndexOffset.asIntBuffer().put(0,
            GPUMapIndexOffset.asIntBuffer().get(0) + GPUTickList[k-1]);
    //Write index offset to GPU buffer
    ret = clEnqueueWriteBuffer(clCommandQueue, memObjects[12], CL_TRUE, 0,
        Sizeof.cl_int, Pointer.to(GPUMapIndexOffset.position(0)), 0, null, null);
    //Set work size (width of layer)
    global_work_size[0] = GPUTickList[k];
    ret = clEnqueueNDRangeKernel(clCommandQueue, kernel_iterate, 1,
        global_work_offset, global_work_size, local_work_size,
        0, null, null);
}
EDIT 2: I've uploaded the full code to pastebin.
Solved. I fixed the error by making everything that was indexed with [0] a plain kernel parameter rather than a single-element buffer. Clearly the hardware doesn't like lots of work-items accessing one particular element of a buffer at once.
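For anyone hitting the same thing, the change looks roughly like the sketch below (illustrative only, not the exact pastebin code; the argument index 10 and the variable names are made up). The kernel now takes, e.g., const int indexOffset by value instead of __global const int *indexOffset, and the host sets it with clSetKernelArg() each tick instead of writing to a one-element buffer:

//Sketch only: pass the per-layer offset as a by-value kernel argument.
//"10" is a placeholder argument index for indexOffset in the real signature.
int indexOffset = 0;
for(int k = 0; k < GPUTickList.length; k++)
{
    if(k > 0)
        indexOffset += GPUTickList[k-1];
    //Scalar args take their size and a Pointer to a host-side value;
    //no clEnqueueWriteBuffer is needed any more.
    clSetKernelArg(kernel_iterate, 10, Sizeof.cl_int,
        Pointer.to(new int[]{ indexOffset }));
    //Set work size (width of layer)
    global_work_size[0] = GPUTickList[k];
    ret = clEnqueueNDRangeKernel(clCommandQueue, kernel_iterate, 1,
        global_work_offset, global_work_size, local_work_size,
        0, null, null);
}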
I'm not sure about what you have above the loop... do you use the queue anywhere other than in this loop? Below is something you may want to try out.
//flush + finish if you need to before the loop, otherwise remove these lines
clFlush(clCommandQueue);
clFinish(clCommandQueue);
cl_event latestEvent = new cl_event();
//Calc
for(int k = 0; k < GPUTickList.length; k++)
{
    //If input nodes
    if(k == 0)
        //Set index offset to 0
        GPUMapIndexOffset.asIntBuffer().put(0, 0);
    else
        //Update index offset
        GPUMapIndexOffset.asIntBuffer().put(0,
            GPUMapIndexOffset.asIntBuffer().get(0) + GPUTickList[k-1]);
    //Write index offset to GPU buffer
    ret = clEnqueueWriteBuffer(clCommandQueue, memObjects[12], CL_TRUE, 0,
        Sizeof.cl_int, Pointer.to(GPUMapIndexOffset.position(0)), 0, null, null);
    //Set work size (width of layer)
    global_work_size[0] = GPUTickList[k];
    //Enqueue the kernel and hand it an event to signal on completion
    ret = clEnqueueNDRangeKernel(clCommandQueue, kernel_iterate, 1,
        global_work_offset, global_work_size, local_work_size,
        0, null, latestEvent);
    //Block until this layer's kernel has finished before starting the next one
    clWaitForEvents(1, new cl_event[]{ latestEvent });
}
Also bear in mind that if I use clFinish() rather than clWaitForEvents(), the error is instead CL_INVALID_COMMAND_QUEUE at this point. - chrisvarnz 2012-04-04 14:07