Remove existing CUDA installation and NVIDIA drivers (optional)

# !dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
# !apt-get remove cuda-*
# !apt autoremove
# !apt-get update

Install a specific CUDA version (optional)

# !dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# !apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
# !apt-get update
# !apt-get install cuda-9.2

Installation check

!nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0

Install the nvcc4jupyter Jupyter extension

!pip install git+https://github.com/j143/nvcc4jupyter.git
Collecting git+git://github.com/j143/nvcc4jupyter.git
  Cloning git://github.com/j143/nvcc4jupyter.git to /tmp/pip-req-build-bmsddg1m
  Running command git clone -q git://github.com/j143/nvcc4jupyter.git /tmp/pip-req-build-bmsddg1m
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4306 sha256=8e5c1d33f097359faac19a33ddab1dc8b0c99dcd1957490a2c6652f8b0a6e62a
  Stored in directory: /tmp/pip-ephem-wheel-cache-40gt56am/wheels/26/f6/b6/abe58d118498a098d0c925b2011902a9f1b4a50629ef215768
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2

Load the plugin

%load_ext nvcc_plugin
created output directory at /content/src
Out bin /content/result.out

Run CUDA

Run a print example

%%cu
#include <iostream>

// Entry point: writes one fixed line to standard output and exits with status 0.
int main() {
    const char* message = "This is from CUDA\n";
    std::cout << message;
    return 0;
}
This is from CUDA

A more involved example

# With base=0, int() infers the radix from the literal's prefix, so "0b100" is parsed as binary.
int('0b100', base=0)
4
%%cu
#include <cstdio>
#include <iostream>

using namespace std;

// Per-block maximum: each block scans its own chunk of up to 256 consecutive
// elements of `a` and stores the largest one in b[blockIdx.x].
//
//   a - device array holding at least n ints (input)
//   b - device array with one slot for every block whose chunk is non-empty (output)
//   n - number of valid elements in a
//
// The scan is a plain serial loop with no cooperation between threads, so every
// thread of a block repeats the same work and stores the same value.
__global__ void maxi(int* a, int* b, int n) {
	int first = 256 * blockIdx.x;

	// A block whose chunk starts at or past the end has nothing to reduce;
	// storing a placeholder would inject a value that is not in the input.
	if (first >= n) {
		return;
	}

	int last = min(256 + first, n);

	// Seed with the chunk's first element instead of 0 so that inputs made up
	// entirely of negative numbers still yield their true maximum.
	int best = a[first];
	for (int i = first + 1; i < last; i++) {
		if (best < a[i]) {
			best = a[i];
		}
	}
	b[blockIdx.x] = best;
}

// Reports a failed CUDA call on stderr, naming the step in `what`.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
	if (status == cudaSuccess) {
		return true;
	}
	fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
	return false;
}

// Fills a small array with pseudo-random values, reduces it to its maximum on
// the GPU by launching maxi repeatedly (each pass shrinks the data by a factor
// of up to 256), and prints the maximum together with the elapsed GPU time.
// Returns 0 on success and 1 if any CUDA call failed.
int main() {
	// Left shift: 3 << 2 == 12 elements. (A right shift here would give 0 and
	// leave nothing to reduce.)
	const int count = 3 << 2;
	int a[count];

	for (int i = 0; i < count; i++) {
		a[i] = rand() % count;
		cout << a[i] << "\t";
	}
	cout << endl;

	int n = count;
	int blocks = (n + 255) / 256;  // number of 256-element chunks, rounded up

	int* ad = nullptr;
	int* bd = nullptr;
	cudaEvent_t start = nullptr;
	cudaEvent_t end = nullptr;

	// Each step runs only if all previous ones succeeded.
	bool ok = cudaOk(cudaMalloc(&ad, n * sizeof(int)), "allocate input buffer")
		&& cudaOk(cudaMalloc(&bd, blocks * sizeof(int)), "allocate partial-maxima buffer")
		&& cudaOk(cudaMemcpy(ad, a, n * sizeof(int), cudaMemcpyHostToDevice), "copy input to device")
		&& cudaOk(cudaEventCreate(&start), "create start event")
		&& cudaOk(cudaEventCreate(&end), "create end event")
		&& cudaOk(cudaEventRecord(start), "record start event");

	while (ok && n > 1) {
		// One single-thread block per 256-element chunk of the current data.
		blocks = (n + 255) / 256;
		maxi<<<blocks, 1>>>(ad, bd, n);
		// A launch returns nothing; a rejected configuration shows up here.
		ok = cudaOk(cudaGetLastError(), "launch maxi");

		// The partial maxima become the next pass's input. Swapping the
		// pointers avoids a device-to-device copy; the old input buffer is
		// always large enough to receive the next, smaller, set of results.
		int* previousInput = ad;
		ad = bd;
		bd = previousInput;
		n = blocks;
	}

	float time = 0;
	int ans = 0;
	ok = ok
		&& cudaOk(cudaEventRecord(end), "record end event")
		&& cudaOk(cudaEventSynchronize(end), "wait for end event")
		&& cudaOk(cudaEventElapsedTime(&time, start, end), "read elapsed time")
		&& cudaOk(cudaMemcpy(&ans, ad, sizeof(int), cudaMemcpyDeviceToHost), "copy result to host");

	if (ok) {
		cout << "The maximum element is : " << ans << endl;

		cout << "The time required : ";
		cout << time << endl;
	}

	// Release everything that was acquired, on success and failure alike.
	if (start != nullptr) {
		cudaEventDestroy(start);
	}
	if (end != nullptr) {
		cudaEventDestroy(end);
	}
	cudaFree(ad);
	cudaFree(bd);

	return ok ? 0 : 1;
}
The maximum element is : -1365408560
The time required : 0.003264

Example 1

%%cu

#include <stdio.h>

// This is a special function that runs on the GPU (device) instead of the CPU (host).
// Every thread that executes it prints one greeting line.
__global__ void kernel() {
  printf("Hello world!\n");
}

// Launches the kernel once and reports any CUDA failure.
// Returns 0 on success and 1 if the launch or the kernel itself failed.
int main() {
  // Invoke the kernel function on the GPU with one block of one thread
  kernel<<<1,1>>>();

  // A launch has no return value, so a rejected launch must be queried explicitly
  cudaError_t status = cudaGetLastError();

  // Wait for the kernel to finish; errors raised while it ran are returned here
  // (remember to check the result of _every_ CUDA function)
  if(status == cudaSuccess) {
    status = cudaDeviceSynchronize();
  }

  if(status != cudaSuccess) {
    fprintf(stderr, "CUDA Error: %s\n", cudaGetErrorString(status));
    return 1;
  }

  return 0;
}
Hello world!

Example 2

%%cu

#include <stdio.h>

// This kernel runs on the GPU and prints the thread's identifiers:
// the index of its block and its own index inside that block.
__global__ void kernel() {
  printf("Hello from block %d thread %d\n", blockIdx.x, threadIdx.x);
}

// Launches the kernel on a small grid and reports any CUDA failure.
// Returns 0 on success and 1 if the launch or the kernel itself failed.
int main() {
  // Launch the kernel on the GPU with four blocks of two threads each
  kernel<<<4,2>>>();

  // A launch has no return value, so a rejected launch must be queried explicitly
  cudaError_t status = cudaGetLastError();

  // Wait for the kernel to finish; errors raised while it ran are returned here
  if(status == cudaSuccess) {
    status = cudaDeviceSynchronize();
  }

  if(status != cudaSuccess) {
    fprintf(stderr, "CUDA Error: %s\n", cudaGetErrorString(status));
    return 1;
  }
  return 0;
}
Hello from block 2 thread 0
Hello from block 2 thread 1
Hello from block 0 thread 0
Hello from block 0 thread 1
Hello from block 3 thread 0
Hello from block 3 thread 1
Hello from block 1 thread 0
Hello from block 1 thread 1

Example 3

%%cu

#include <stdint.h>
#include <stdio.h>

#define N 32
#define THREADS_PER_BLOCK 32

// Single-precision "a times x plus y": y[i] = a * x[i] + y[i].
//
//   a - scalar multiplier
//   x - device array of N floats (read)
//   y - device array of N floats (read and updated in place)
//
// Expects a one-dimensional launch; each thread handles the element whose
// index equals its global thread index, and threads past N do nothing.
__global__ void saxpy(float a, float* x, float* y) {
  // Which index of the array should this thread use?
  size_t index = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

  // The grid is rounded up to whole blocks, so trailing threads may have no element
  if(index < N) {
    // Compute a times x plus y for a specific index
    y[index] = a * x[index] + y[index];
  }
}

// Builds X = 0..N-1 and Y = 0 on the CPU, runs saxpy with a = 0.5 on the GPU,
// and prints the resulting Y array.
// Returns 0 on success, 2 if an allocation failed, and 1 if a copy or the
// kernel failed. All memory is released on every path.
int main() {
  int status = 0;
  int i;

  // Allocate arrays for X and Y on the CPU. This memory is only usable on the CPU
  float* cpu_x = (float*)malloc(sizeof(float) * N);
  float* cpu_y = (float*)malloc(sizeof(float) * N);

  // The gpu_x and gpu_y pointers will only be usable on the GPU (which uses separate memory)
  float* gpu_x = NULL;
  float* gpu_y = NULL;

  // Single-pass block: any failure breaks out to the shared cleanup below
  do {
    if(cpu_x == NULL || cpu_y == NULL) {
      fprintf(stderr, "Failed to allocate X and Y arrays on CPU\n");
      status = 2;
      break;
    }

    // Initialize X and Y
    for(i=0; i<N; i++) {
      cpu_x[i] = (float)i;
      cpu_y[i] = 0.0;
    }

    // Allocate space for the x array on the GPU
    if(cudaMalloc(&gpu_x, sizeof(float) * N) != cudaSuccess) {
      fprintf(stderr, "Failed to allocate X array on GPU\n");
      status = 2;
      break;
    }

    // Allocate space for the y array on the GPU
    if(cudaMalloc(&gpu_y, sizeof(float) * N) != cudaSuccess) {
      fprintf(stderr, "Failed to allocate Y array on GPU\n");
      status = 2;
      break;
    }

    // Copy the cpu's x array to the gpu with cudaMemcpy
    if(cudaMemcpy(gpu_x, cpu_x, sizeof(float) * N, cudaMemcpyHostToDevice) != cudaSuccess) {
      fprintf(stderr, "Failed to copy X to the GPU\n");
      status = 1;
      break;
    }

    // Copy the cpu's y array to the gpu with cudaMemcpy
    if(cudaMemcpy(gpu_y, cpu_y, sizeof(float) * N, cudaMemcpyHostToDevice) != cudaSuccess) {
      fprintf(stderr, "Failed to copy Y to the GPU\n");
      status = 1;
      break;
    }

    // Calculate the number of blocks to run, rounding up to include all threads
    size_t blocks = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    // Run the saxpy kernel
    saxpy<<<blocks, THREADS_PER_BLOCK>>>(0.5f, gpu_x, gpu_y);

    // A launch has no return value, so a rejected launch must be queried explicitly
    cudaError_t err = cudaGetLastError();

    // Wait for the kernel to finish; errors raised while it ran are returned here
    if(err == cudaSuccess) {
      err = cudaDeviceSynchronize();
    }
    if(err != cudaSuccess) {
      fprintf(stderr, "CUDA Error: %s\n", cudaGetErrorString(err));
      status = 1;
      break;
    }

    // Copy the y array back from the gpu to the cpu
    if(cudaMemcpy(cpu_y, gpu_y, sizeof(float) * N, cudaMemcpyDeviceToHost) != cudaSuccess) {
      fprintf(stderr, "Failed to copy Y from the GPU\n");
      status = 1;
      break;
    }

    // Print the updated y array
    for(i=0; i<N; i++) {
      printf("%d: %f\n", i, cpu_y[i]);
    }
  } while(0);

  // Freeing a NULL pointer is a no-op for both cudaFree and free
  cudaFree(gpu_x);
  cudaFree(gpu_y);
  free(cpu_x);
  free(cpu_y);

  return status;
}
/tmp/tmpzza2hszf/5bd0fb96-f0fd-4537-a788-0ddff5743a19.cu(13): error: identifier "Z" is undefined

/tmp/tmpzza2hszf/5bd0fb96-f0fd-4537-a788-0ddff5743a19.cu(14): error: expected a ";"

2 errors detected in the compilation of "/tmp/tmpzza2hszf/5bd0fb96-f0fd-4537-a788-0ddff5743a19.cu".