% Times a Mandelbrot-style evaluation carried out by a hand-written CUDA
% kernel (processMandelbrotElement) that is launched through
% parallel.gpu.CUDAKernel, then prints the elapsed wall-clock time.

% Problem setup
maxIterations = 500;  % forwarded to the kernel (presumably the per-element iteration cap)
gridSize = 1000;      % number of sample points along each axis
rlim = [-0.748766713922161, -0.748766707771757];  % range sampled into x (presumably the real axis)
ilim = [ 0.123640844894862,  0.123640851045266];  % range sampled into y (presumably the imaginary axis)


% Start the timer. The measured interval covers grid construction, kernel
% loading and the kernel launch itself.
t = tic();
x = gpuArray.linspace( rlim(1), rlim(2), gridSize );
y = gpuArray.linspace( ilim(1), ilim(2), gridSize );
[xGrid,yGrid] = meshgrid( x, y );

% Load the kernel
% NOTE(review): '.ptxa64' appears to be the 64-bit Linux PTX extension; on
% another platform the compiled file may carry a different extension
% (see parallel.gpu.ptxext) - confirm before running elsewhere.
kernel = parallel.gpu.CUDAKernel( 'processMandelbrotElement.ptxa64', ...
                                  'processMandelbrotElement.cu' );

% Make sure we have sufficient blocks to cover the whole array:
% one-dimensional launch with the largest block the kernel supports and
% enough blocks that blocks*threads >= numElements.
numElements = numel( xGrid );
kernel.ThreadBlockSize = [kernel.MaxThreadsPerBlock,1,1];
kernel.GridSize = [ceil(numElements/kernel.MaxThreadsPerBlock),1];

% Call the kernel. 'count' is pre-allocated on the GPU and passed in as the
% first argument; feval returns the kernel's output for that argument.
% NOTE(review): the trailing argument 1 must match an extra parameter in the
% signature declared in processMandelbrotElement.cu - confirm against the
% kernel source.
count = gpuArray.zeros( size(xGrid) );
count = feval( kernel, count, xGrid, yGrid, maxIterations, numElements, 1 );

% GPU execution is asynchronous with respect to MATLAB: without an explicit
% synchronization toc could return before the kernel has finished, which
% would under-report the elapsed time. Block until the device is idle.
% 'count' deliberately stays on the GPU (no gather).
wait( gpuDevice );

% Show
gpuCUDAKernelTime = toc( t );
% Plotting is disabled; the title below also needs a 'cpuTime' value that
% this script does not define.
%imagesc( x, y, count )
%axis image
%title( sprintf( '%1.2fsecs (GPU CUDAKernel) = %1.1fx faster', ...
%    gpuCUDAKernelTime, cpuTime/gpuCUDAKernelTime ) );

fprintf( '%1.2fsecs (with GPU super-smartly)\n', gpuCUDAKernelTime );
