“CUDAfy.NET is a .NET 4.0 library that allows writing of NVIDIA CUDA and (Intel/AMD/NVIDIA/Altera, etc.) OpenCL applications from within .NET. There are no separate CUDA cu files or complex set-up procedures to launch GPU device functions. It follows the CUDA programming model and any knowledge gained from tutorials or books on CUDA can be easily transferred to CUDAfy, only in a clean .NET fashion.”
www.hybriddsp.com
http://www.cass-hpc.com/category/cudanet CUDA.NET has been modified by hybriddsp to support CUDA Toolkit >=3.2 (currently running 5.0)
The Translator converts .NET code into CUDA or OpenCL code.
Detailed guide on installing CUDA dev tools:
http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-microsoft-windows/index.html
Latest NVIDIA drivers for my NVIDIA GeForce GT 550M – it should have compute capability 2.1. Currently installed:
29-Sep-12
9.18.13.694
http://www.geforce.com/drivers/results/59707
The site says I need driver 314.22 WHQL (25-Mar-13, 215.41 MB).
**get Intel OpenCL SDK: http://software.intel.com/en-us/vcsource/tools/opencl-sdk
1.Simple_Kernel
Functions running on a device are often referred to as kernels

2.Simple_Kernel_Params
/// <summary>
/// Console entry point for the simple_kernel_params sample.
/// </summary>
internal class Program
{
    /// <summary>
    /// Runs the sample by delegating to <c>simple_kernel_params.Execute</c>.
    /// </summary>
    /// <param name="args">Command-line arguments; not read by this method.</param>
    private static void Main(string[] args)
    {
        simple_kernel_params.Execute();
    }
}
/// <summary>
/// Sample showing how to pass parameters to a device function and read a
/// single result back to the host.
/// </summary>
public class simple_kernel_params
{
    /// <summary>
    /// Translates and loads this class's [Cudafy] methods, launches <c>add</c>
    /// with the arguments 2 and 7, copies the result back and prints it.
    /// </summary>
    public static void Execute()
    {
        CudafyModule module = CudafyTranslator.Cudafy();
        GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
        gpu.LoadModule(module);

        // A device function cannot return a value and the 'out' keyword is not
        // supported, so the result travels through an array parameter. Device
        // memory is allocated even though it only ever holds one Int32
        // (the cudaMalloc equivalent).
        int[] deviceResult = gpu.Allocate<int>();

        // Equivalent explicit form: gpu.Launch(1, 1, "add", 2, 7, deviceResult);
        gpu.Launch().add(2, 7, deviceResult);

        // Bring the single result value back to the host and show it.
        int hostResult;
        gpu.CopyFromDevice(deviceResult, out hostResult);
        Console.WriteLine("2 + 7 = {0}", hostResult);

        // The 'sub' kernel below can be tried the same way: launch it with
        // (2, 7, deviceResult), copy the value back and print "2 - 7 = {0}".

        gpu.Free(deviceResult);
    }

    /// <summary>
    /// Device function: stores <paramref name="a"/> + <paramref name="b"/>
    /// in the first element of <paramref name="c"/>.
    /// </summary>
    /// <param name="a">First operand.</param>
    /// <param name="b">Second operand.</param>
    /// <param name="c">Array whose element 0 receives the sum.</param>
    [Cudafy]
    public static void add(int a, int b, int[] c)
    {
        int sum = a + b;
        c[0] = sum;
    }

    /// <summary>
    /// Device function: stores <paramref name="a"/> - <paramref name="b"/>
    /// in the first element of <paramref name="c"/>.
    /// </summary>
    /// <param name="a">Value to subtract from.</param>
    /// <param name="b">Value to subtract.</param>
    /// <param name="c">Array whose element 0 receives the difference.</param>
    [Cudafy]
    public static void sub(int a, int b, int[] c)
    {
        int difference = a - b;
        c[0] = difference;
    }
}
3.Enum_GPU

/// <summary>
/// Console entry point for the enum_gpu sample.
/// </summary>
internal class Program
{
    /// <summary>
    /// Runs the sample by delegating to <c>enum_gpu.Execute</c>.
    /// </summary>
    /// <param name="args">Command-line arguments; not read by this method.</param>
    private static void Main(string[] args)
    {
        enum_gpu.Execute();
    }
}
/// <summary>
/// Sample that lists the properties of every device reported for the
/// configured target.
/// </summary>
public class enum_gpu
{
    /// <summary>
    /// Walks the device properties returned for <c>CudafyModes.Target</c> and
    /// prints a general, a memory and a multiprocessor section for each one,
    /// numbering the devices from 0 in enumeration order.
    /// </summary>
    public static void Execute()
    {
        int deviceIndex = 0;
        foreach (GPGPUProperties prop in CudafyHost.GetDeviceProperties(CudafyModes.Target))
        {
            PrintGeneralInfo(prop, deviceIndex);
            PrintMemoryInfo(prop, deviceIndex);
            PrintMultiprocessorInfo(prop, deviceIndex);
            deviceIndex++;
        }
    }

    // Prints name, platform, id, compute capability, clock rate and the simulated flag.
    private static void PrintGeneralInfo(GPGPUProperties prop, int deviceIndex)
    {
        Console.WriteLine(" --- General Information for device {0} ---", deviceIndex);
        Console.WriteLine("Name: {0}", prop.Name);
        Console.WriteLine("Platform Name: {0}", prop.PlatformName);
        Console.WriteLine("Device Id: {0}", prop.DeviceId);
        Console.WriteLine("Compute capability: {0}.{1}", prop.Capability.Major, prop.Capability.Minor);
        Console.WriteLine("Clock rate: {0}", prop.ClockRate);
        Console.WriteLine("Simulated: {0}", prop.IsSimulated);
        Console.WriteLine();
    }

    // Prints the total/constant memory sizes, memory pitch and texture alignment.
    private static void PrintMemoryInfo(GPGPUProperties prop, int deviceIndex)
    {
        Console.WriteLine(" --- Memory Information for device {0} ---", deviceIndex);
        Console.WriteLine("Total global mem: {0}", prop.TotalMemory);
        Console.WriteLine("Total constant Mem: {0}", prop.TotalConstantMemory);
        Console.WriteLine("Max mem pitch: {0}", prop.MemoryPitch);
        Console.WriteLine("Texture Alignment: {0}", prop.TextureAlignment);
        Console.WriteLine();
    }

    // Prints shared memory, registers, warp size and the thread/grid limits.
    // Note: the first two labels say "per mp" while the values printed are the
    // SharedMemoryPerBlock and RegistersPerBlock properties.
    private static void PrintMultiprocessorInfo(GPGPUProperties prop, int deviceIndex)
    {
        Console.WriteLine(" --- MP Information for device {0} ---", deviceIndex);
        Console.WriteLine("Shared mem per mp: {0}", prop.SharedMemoryPerBlock);
        Console.WriteLine("Registers per mp: {0}", prop.RegistersPerBlock);
        Console.WriteLine("Threads in warp: {0}", prop.WarpSize);
        Console.WriteLine("Max threads per block: {0}", prop.MaxThreadsPerBlock);
        Console.WriteLine(
            "Max thread dimensions: ({0}, {1}, {2})",
            prop.MaxThreadsSize.x,
            prop.MaxThreadsSize.y,
            prop.MaxThreadsSize.z);
        Console.WriteLine(
            "Max grid dimensions: ({0}, {1}, {2})",
            prop.MaxGridSize.x,
            prop.MaxGridSize.y,
            prop.MaxGridSize.z);
        Console.WriteLine();
    }
}
4.Add_Loop_GPU
The CPU version was interesting too (see the other article), as it passes a reference type by value.
/// <summary>
/// Sample that adds two N-element integer arrays on the device, with one
/// kernel invocation per element.
/// </summary>
public class add_loop_gpu
{
    /// <summary>Number of elements in each array.</summary>
    public const int N = 10;

    /// <summary>
    /// Translates and loads this class's [Cudafy] methods, fills two host
    /// arrays, adds them element-wise on the device via <c>adder</c> and
    /// prints every "a + b = c" line.
    /// </summary>
    public static void Execute()
    {
        CudafyModule km = CudafyTranslator.Cudafy();
        GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
        gpu.LoadModule(km);

        int[] a = new int[N];
        int[] b = new int[N];
        int[] c = new int[N];

        // allocate the memory on the GPU
        int[] dev_a = gpu.Allocate<int>(a);
        int[] dev_b = gpu.Allocate<int>(b);
        int[] dev_c = gpu.Allocate<int>(c);

        // fill the arrays 'a' and 'b' on the CPU
        for (int i = 0; i < N; i++)
        {
            a[i] = -i;
            b[i] = i * i;
        }

        // copy the arrays 'a' and 'b' to the GPU
        gpu.CopyToDevice(a, dev_a);
        gpu.CopyToDevice(b, dev_b);

        // Launch with a grid size of N and a block size of 1: the kernel picks
        // its element from blockIdx.x, so this gives one block (of a single
        // thread) per element, i.e. 10 blocks.
        gpu.Launch(N, 1).adder(dev_a, dev_b, dev_c);

        // copy the array 'c' back from the GPU to the CPU
        gpu.CopyFromDevice(dev_c, c);

        // display the results
        for (int i = 0; i < N; i++)
        {
            Console.WriteLine("{0} + {1} = {2}", a[i], b[i], c[i]);
        }

        // free the memory allocated on the GPU
        gpu.Free(dev_a);
        gpu.Free(dev_b);
        gpu.Free(dev_c);
    }

    /// <summary>
    /// Device function: adds the single pair of elements selected by the
    /// caller's block index and stores the sum in <paramref name="c"/>.
    /// </summary>
    /// <param name="thread">Supplies the block index of this invocation.</param>
    /// <param name="a">First input array.</param>
    /// <param name="b">Second input array.</param>
    /// <param name="c">Output array; only element <c>blockIdx.x</c> is written.</param>
    [Cudafy]
    public static void adder(GThread thread, int[] a, int[] b, int[] c)
    {
        // Each invocation receives the entire arrays but only adds one pair of
        // elements. With the launch in Execute, tid ranges from 0 to N - 1.
        int tid = thread.blockIdx.x;

        // Guard against a launch with more blocks than there are elements.
        if (tid < N)
        {
            c[tid] = a[tid] + b[tid];
        }
    }
}
128 threads
need to add 32768 elements
each thread doing 256 elements
so each thread needs to jump 128 elements on from its starting point each time
GPU going flat out, CPUs idling.
public class add_loop_long
{
public const int N = 65536;
/// <summary>
/// Translates and loads this class's [Cudafy] methods, fills two N-element
/// host arrays, runs the <c>add</c> kernel 1000 times with a launch of
/// (128, 1), then copies the result back and checks it on the CPU.
/// Every matching element is printed; the first mismatch is printed and
/// stops the check. "We did it!" is printed only when all elements match.
/// </summary>
public static void Execute()
{
    CudafyModule module = CudafyTranslator.Cudafy();
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    gpu.LoadModule(module);

    int[] hostA = new int[N];
    int[] hostB = new int[N];
    int[] hostSum = new int[N];

    // Device-side counterparts of the three host arrays.
    int[] devA = gpu.Allocate<int>(hostA);
    int[] devB = gpu.Allocate<int>(hostB);
    int[] devSum = gpu.Allocate<int>(hostSum);

    // Populate the inputs on the CPU: a[k] = k, b[k] = 2k.
    for (int k = 0; k < N; k++)
    {
        hostA[k] = k;
        hostB[k] = 2 * k;
    }

    // Upload both inputs.
    gpu.CopyToDevice(hostA, devA);
    gpu.CopyToDevice(hostB, devB);

    // Repeat the same (128, 1) launch many times to keep the device busy;
    // the kernel itself walks the arrays in steps of gridDim.x.
    const int launchCount = 1000;
    for (int run = 0; run < launchCount; run++)
    {
        gpu.Launch(128, 1).add(devA, devB, devSum);
    }

    // Download the result.
    gpu.CopyFromDevice(devSum, hostSum);

    // Verify on the CPU, echoing each element that checks out.
    int checkedCount = 0;
    while (checkedCount < N && (hostA[checkedCount] + hostB[checkedCount]) == hostSum[checkedCount])
    {
        Console.WriteLine("{0} + {1} = {2}", hostA[checkedCount], hostB[checkedCount], hostSum[checkedCount]);
        checkedCount++;
    }

    if (checkedCount < N)
    {
        // Stopped early: report the first element that did not match.
        Console.WriteLine("{0} + {1} != {2}", hostA[checkedCount], hostB[checkedCount], hostSum[checkedCount]);
    }
    else
    {
        Console.WriteLine("We did it!");
    }

    // Release the device allocations.
    gpu.Free(devA);
    gpu.Free(devB);
    gpu.Free(devSum);
}
/// <summary>
/// Device function: element-wise sum of <paramref name="a"/> and
/// <paramref name="b"/> into <paramref name="c"/>. Each invocation starts at
/// its own block index and then advances by the grid's x dimension until it
/// reaches N.
/// </summary>
/// <param name="thread">Supplies <c>blockIdx.x</c> (start) and <c>gridDim.x</c> (step).</param>
/// <param name="a">First input array.</param>
/// <param name="b">Second input array.</param>
/// <param name="c">Output array receiving the sums.</param>
[Cudafy]
public static void add(GThread thread, int[] a, int[] b, int[] c)
{
    // With the (128, 1) launch used by Execute the step is 128, so each
    // invocation covers N / 128 elements: 512 for N = 65536 (256 for 32768).
    int step = thread.gridDim.x;

    // The starting index is the block index: 0..127 for that launch.
    for (int tid = thread.blockIdx.x; tid < N; tid += step)
    {
        c[tid] = a[tid] + b[tid];
    }
}