% Shared parameters for every benchmark variant in this script:
%   maxIterations - upper bound of the iteration loop (the loop runs 0:maxIterations)
%   gridSize      - number of sample points along each axis
%   rlim / ilim   - [min max] of the real / imaginary axis of the sampled region
maxIterations = 500;
gridSize = 1000;
rlim = [-0.748766713922161, -0.748766707771757];
ilim = [ 0.123640844894862,  0.123640851045266];

% --- CPU reference implementation -------------------------------------
% Build the complex starting grid z0 on the host.
t = tic;
x = linspace( rlim(1), rlim(2), gridSize );
y = linspace( ilim(1), ilim(2), gridSize );
[xGrid,yGrid] = meshgrid( x, y );
z0 = complex( xGrid, yGrid );
count = zeros( gridSize, gridSize );

% Iterate z <- z.*z + z0 and, for every point, count the iterations on
% which |z| is still within radius 2.
z = z0;
for n = 0:maxIterations
    z = z0 + z.*z;
    inside = ( 2 >= abs( z ) );
    count = inside + count;
end
% Compress the dynamic range of the counts for display.
count = log( 1 + count );

% Report the elapsed wall-clock time of the CPU run.
cpuTime = toc( t );
% Optional visualisation (uncomment to plot the result):
%set( gcf, 'Position', [200 200 600 600] );
%imagesc( x, y, count );
%axis image
%colormap( [jet();flipud( jet() );0 0 0] );
%title( sprintf( '%1.2fsecs (without GPU)', cpuTime ) );
fprintf( '%1.2fsecs (without GPU)\n', cpuTime );

%% 

% --- Naive GPU implementation -----------------------------------------
% Same algorithm as the CPU version, but the grid and the counters are
% created as gpuArrays so the element-wise operations run on the GPU.
% Requires maxIterations, gridSize, rlim and ilim to be defined already.

% Setup
t = tic();
x = gpuArray.linspace( rlim(1), rlim(2), gridSize );
y = gpuArray.linspace( ilim(1), ilim(2), gridSize );
[xGrid,yGrid] = meshgrid( x, y );
z0 = complex( xGrid, yGrid );
count = gpuArray.zeros( size(z0) );

% Calculate
% Iterate z <- z.*z + z0, counting per point the iterations on which
% |z| <= 2.
z = z0;
for n = 0:maxIterations
    z = z.*z + z0;
    inside = abs( z )<=2;
    count = count + inside;
end
count = log( count+1 );

% Show
% GPU operations are queued asynchronously, so block until the device has
% finished all outstanding work before stopping the timer; otherwise the
% reported time may not include the whole computation.
wait( gpuDevice );
naiveGPUTime = toc( t );
%imagesc( x, y, count )
%axis image
%title( sprintf( '%1.2fsecs (naive GPU) = %1.1fx faster', ...
%    naiveGPUTime, cpuTime/naiveGPUTime ) )

fprintf( '%1.2fsecs (with GPU but naively)\n', naiveGPUTime );


% --- GPU arrayfun implementation --------------------------------------
% The per-element work is delegated to processMandelbrotElement (defined
% outside this file), applied element-wise to the gpuArray grids through
% arrayfun.
% Requires maxIterations, gridSize, rlim and ilim to be defined already.

% Setup
t = tic();
x = gpuArray.linspace( rlim(1), rlim(2), gridSize );
y = gpuArray.linspace( ilim(1), ilim(2), gridSize );
[xGrid,yGrid] = meshgrid( x, y );

% Calculate
count = arrayfun( @processMandelbrotElement, xGrid, yGrid, maxIterations );

% Show
% GPU operations are queued asynchronously, so block until the device has
% finished all outstanding work before stopping the timer; otherwise the
% reported time may not include the whole computation.
wait( gpuDevice );
gpuArrayfunTime = toc( t );
%imagesc( x, y, count )
%axis image
%title( sprintf( '%1.2fsecs (GPU arrayfun) = %1.1fx faster', ...
%    gpuArrayfunTime, cpuTime/gpuArrayfunTime ) );

fprintf( '%1.2fsecs (with GPU smartly)\n', gpuArrayfunTime );


%% CUDAKernel implementation
% Runs a pre-compiled CUDA kernel over the grid, one thread per element.
% Requires maxIterations, gridSize, rlim and ilim to be defined already.
t = tic();
x = gpuArray.linspace( rlim(1), rlim(2), gridSize );
y = gpuArray.linspace( ilim(1), ilim(2), gridSize );
[xGrid,yGrid] = meshgrid( x, y );

% Load the kernel
% NOTE(review): the '.ptxa64' file name is hard-coded; presumably this is
% a platform-specific PTX build - confirm it exists on the target machine.
kernel = parallel.gpu.CUDAKernel( 'processMandelbrotElement.ptxa64', ...
                                  'processMandelbrotElement.cu' );

% Make sure we have sufficient blocks to cover the whole array:
% use the largest 1-D thread block and round the block count up.
numElements = numel( xGrid );
kernel.ThreadBlockSize = [kernel.MaxThreadsPerBlock,1,1];
kernel.GridSize = [ceil(numElements/kernel.MaxThreadsPerBlock),1];

% Call the kernel
% count is pre-allocated on the GPU and passed as the first kernel
% argument; the kernel's output is assigned back to count.
% NOTE(review): the meaning of the trailing argument (1) depends on the
% kernel signature in processMandelbrotElement.cu, which is not visible
% here - confirm it matches.
count = gpuArray.zeros( size(xGrid) );
count = feval( kernel, count, xGrid, yGrid, maxIterations, numElements, 1 );

% Show
% Kernel launches are asynchronous, so block until the device has finished
% all outstanding work before stopping the timer; otherwise the reported
% time may not include the whole computation.
wait( gpuDevice );
gpuCUDAKernelTime = toc( t );
%imagesc( x, y, count )
%axis image
%title( sprintf( '%1.2fsecs (GPU CUDAKernel) = %1.1fx faster', ...
%    gpuCUDAKernelTime, cpuTime/gpuCUDAKernelTime ) );

fprintf( '%1.2fsecs (with GPU super-smartly)\n', gpuCUDAKernelTime );
