// tv-image-denoising/src/tv_denoising.cu

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <opencv2/opencv.hpp>

// Threads per block along x and y for the denoising kernel launch (16 x 16 = 256 threads).
#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 16

// Clamps index i into the valid range [0, n - 1] (n must be >= 1).
static inline __host__ __device__ int tvClampIndex(int i, int n)
{
    return i < 0 ? 0 : (i >= n ? n - 1 : i);
}

// Computes the updated value of pixel (row, col) for one denoising iteration.
//
// With S(r, c) denoting the image sampled with both coordinates clamped to the
// image (edge pixels are replicated outwards), the update is
//
//     gx(r, c) = S(r, c + 1) - S(r, c - 1)        central difference in x
//     gy(r, c) = S(r + 1, c) - S(r - 1, c)        central difference in y
//     out      = S(r, c) + lambda * (gx(r, c) - gx(r, c - 1) + gy(r, c) - gy(r - 1, c))
//
// Because every term is built from clamped samples, gx(r, c - 1) is zero in
// column 0 and gy(r - 1, c) is zero in row 0; no read ever leaves the image.
//
// NOTE(review): the update is linear in the image (no division by the gradient
// magnitude), so it behaves like a diffusion step rather than a classic
// total-variation flow -- confirm this is the intended scheme.
//
// image   row-major, single-channel, width * height floats (read only here)
// row/col pixel to update; must satisfy 0 <= row < height, 0 <= col < width
// Callable from host code as well, so it can serve as a CPU reference.
static inline __host__ __device__ float tvUpdatedPixel(const float* image, int width, int height,
                                                       int row, int col, float lambda)
{
    const size_t stride = static_cast<size_t>(width);

    const float* rowPtr    = image + static_cast<size_t>(row) * stride;
    const float* rowAbove  = image + static_cast<size_t>(tvClampIndex(row - 1, height)) * stride;
    const float* rowAbove2 = image + static_cast<size_t>(tvClampIndex(row - 2, height)) * stride;
    const float* rowBelow  = image + static_cast<size_t>(tvClampIndex(row + 1, height)) * stride;

    const float center = rowPtr[col];

    // Gradient at this pixel and at its left / upper neighbour.
    const float gradX     = rowPtr[tvClampIndex(col + 1, width)] - rowPtr[tvClampIndex(col - 1, width)];
    const float gradXLeft = center - rowPtr[tvClampIndex(col - 2, width)];
    const float gradY     = rowBelow[col] - rowAbove[col];
    const float gradYUp   = center - rowAbove2[col];

    return center + lambda * (gradX - gradXLeft + gradY - gradYUp);
}

// Runs maxIterations denoising iterations in place on a single-channel image.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel
// (x -> column, y -> row). The grid may overhang the image; surplus threads
// only take part in the barriers. No shared memory is required and any block
// shape is accepted.
//
// image          device pointer, row-major, width * height floats, updated in place
// width, height  image size in pixels
// lambda         weight applied to the gradient-difference term
// maxIterations  number of iterations; values <= 0 leave the image untouched
//
// Limitation: __syncthreads() only synchronises the threads of one block.
// Within a block every iteration reads all neighbours before any pixel is
// written, but blocks are not ordered with respect to each other, so when the
// grid has more than one block, pixels next to a block edge may observe
// neighbours from a different iteration. For results that are reproducible on
// multi-block grids, run one iteration per launch into a separate output
// buffer instead.
__global__ void tvDenoisingKernel(float* image, int width, int height, float lambda, int maxIterations)
{
    // Calculate the global thread index
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the image must NOT return early: every thread of the
    // block has to reach the barriers below.
    const bool inside = (col < width) && (row < height);
    const size_t index = inside ? static_cast<size_t>(row) * width + col : 0;

    // Perform the denoising iterations
    for (int iteration = 0; iteration < maxIterations; ++iteration)
    {
        // Phase 1: read the neighbourhood and compute the new value into a register.
        float updated = 0.0f;
        if (inside)
        {
            updated = tvUpdatedPixel(image, width, height, row, col, lambda);
        }

        // All reads of this block are complete before any pixel is overwritten.
        __syncthreads();

        // Phase 2: publish the new value.
        if (inside)
        {
            image[index] = updated;
        }

        // All writes of this block are visible before the next iteration reads.
        __syncthreads();
    }
}

// Performs ONE denoising iteration, reading from src and writing to dst.
//
// The image is row-major with interleaved channels: element (row, col, ch)
// lives at (row * width + col) * channels + ch. Each channel is processed
// independently. With S denoting src sampled with row/col clamped to the image,
//
//     gx(r, c) = S(r, c + 1) - S(r, c - 1)
//     gy(r, c) = S(r + 1, c) - S(r - 1, c)
//     dst(r, c) = S(r, c) + lambda * (gx(r, c) - gx(r, c - 1) + gy(r, c) - gy(r - 1, c))
//
// Launch layout: 2D grid of 2D blocks, one thread per float element of a row
// (x covers width * channels, y covers height). The grid may overhang.
// src and dst must not overlap. No shared memory is used.
static __global__ void tvDenoisingStepKernel(const float* __restrict__ src,
                                             float* __restrict__ dst,
                                             int width, int height, int channels,
                                             float lambda)
{
    const int rowElems = width * channels;
    const int elem = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // There are no barriers in this kernel, so surplus threads may simply leave.
    if (elem >= rowElems || row >= height)
    {
        return;
    }

    const int col = elem / channels;
    const int channel = elem - col * channels;

    // Neighbour coordinates, clamped so that no access leaves the image.
    const int colPrev = max(col - 1, 0);
    const int colPrev2 = max(col - 2, 0);
    const int colNext = min(col + 1, width - 1);
    const int rowPrev = max(row - 1, 0);
    const int rowPrev2 = max(row - 2, 0);
    const int rowNext = min(row + 1, height - 1);

    const size_t stride = static_cast<size_t>(rowElems);
    const float* rowPtr = src + static_cast<size_t>(row) * stride;

    const float center = rowPtr[elem];

    // Gradient at this pixel and at its left / upper neighbour.
    const float gradX = rowPtr[colNext * channels + channel] - rowPtr[colPrev * channels + channel];
    const float gradXLeft = center - rowPtr[colPrev2 * channels + channel];
    const float gradY = src[static_cast<size_t>(rowNext) * stride + elem] -
                        src[static_cast<size_t>(rowPrev) * stride + elem];
    const float gradYUp = center - src[static_cast<size_t>(rowPrev2) * stride + elem];

    dst[static_cast<size_t>(row) * stride + elem] =
        center + lambda * (gradX - gradXLeft + gradY - gradYUp);
}

// Raises cv::Exception (cv::Error::GpuApiCallError) when a CUDA call failed.
// `what` names the failing operation for the error message.
static void tvCheckCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        CV_Error(cv::Error::GpuApiCallError,
                 cv::format("%s failed: %s", what, cudaGetErrorString(status)));
    }
}

// Denoises `image` in place on the GPU.
//
// image          input/output image (2D). It is converted to 32-bit float for
//                processing and converted back to its original type afterwards.
//                Every channel is processed independently.
// lambda         weight applied to the gradient-difference term of each iteration
// maxIterations  number of iterations; if <= 0, or if the image is empty, the
//                image is left untouched
//
// Throws cv::Exception if the image has more than two dimensions or if any
// CUDA call fails. Device memory is released on every exit path.
//
// One kernel is launched per iteration, alternating between two device
// buffers. This is deliberate: thread blocks cannot synchronise with each
// other inside a single launch, so looping inside the kernel (as
// tvDenoisingKernel does) while updating the image in place would let blocks
// read neighbours that another block has already overwritten.
extern "C" void TVDenoising(cv::Mat& image, float lambda, int maxIterations)
{
    // Nothing to process; also avoids a zero-sized allocation and grid.
    if (image.empty() || maxIterations <= 0)
    {
        return;
    }
    CV_Assert(image.dims <= 2);

    // Convert the image to float precision
    cv::Mat floatImage;
    image.convertTo(floatImage, CV_32F);
    if (!floatImage.isContinuous())
    {
        // The flat copies below require one contiguous buffer.
        floatImage = floatImage.clone();
    }

    // Get image dimensions
    const int width = floatImage.cols;
    const int height = floatImage.rows;
    const int channels = floatImage.channels();
    const size_t elementCount = floatImage.total() * static_cast<size_t>(channels);
    const size_t byteCount = elementCount * sizeof(float);

    // Owns a device allocation and frees it when leaving scope, including
    // when an exception propagates. cudaFree on a null pointer is a no-op.
    struct DeviceBuffer
    {
        float* ptr;
        DeviceBuffer() : ptr(NULL) {}
        ~DeviceBuffer() { cudaFree(ptr); }
    };

    // Allocate GPU memory: two buffers that swap roles every iteration
    DeviceBuffer bufferA;
    DeviceBuffer bufferB;
    tvCheckCuda(cudaMalloc(&bufferA.ptr, byteCount), "cudaMalloc");
    tvCheckCuda(cudaMalloc(&bufferB.ptr, byteCount), "cudaMalloc");

    // Copy the image data from host to device
    tvCheckCuda(cudaMemcpy(bufferA.ptr, floatImage.ptr<float>(0), byteCount, cudaMemcpyHostToDevice),
                "cudaMemcpy (host to device)");

    // Calculate the number of blocks and threads per block.
    // x spans the floats of one row (width * channels), y spans the rows.
    const dim3 blockSize(BLOCK_SIZE_X, BLOCK_SIZE_Y);
    const int rowElems = width * channels;
    const dim3 gridSize((rowElems + blockSize.x - 1) / blockSize.x,
                        (height + blockSize.y - 1) / blockSize.y);

    // Invoke one kernel per iteration; launches on the same stream run in order.
    float* current = bufferA.ptr;
    float* next = bufferB.ptr;
    for (int iteration = 0; iteration < maxIterations; ++iteration)
    {
        tvDenoisingStepKernel<<<gridSize, blockSize>>>(current, next, width, height, channels, lambda);
        tvCheckCuda(cudaGetLastError(), "tvDenoisingStepKernel launch");

        float* const previous = current;
        current = next;
        next = previous;
    }

    // Copy the denoised image data back from device to host. This blocking copy
    // also reports any error raised while the kernels were executing.
    tvCheckCuda(cudaMemcpy(floatImage.ptr<float>(0), current, byteCount, cudaMemcpyDeviceToHost),
                "cudaMemcpy (device to host)");

    // Convert the denoised image back to the original data type
    floatImage.convertTo(image, image.type());

    // The GPU memory is freed by the DeviceBuffer destructors.
}