From 20d887293037106fc22b23fcdde87d755c54ac4a Mon Sep 17 00:00:00 2001
From: Vargha Csongor <Culisa1023@gmail.com>
Date: Sun, 25 Jun 2023 14:34:54 +0200
Subject: [PATCH] Add CUDA kernel to perform denoising

---
 src/tv_denoising.cu | 48 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)
diff --git a/src/tv_denoising.cu b/src/tv_denoising.cu
index b312b45..d5756c3 100644
--- a/src/tv_denoising.cu
+++ b/src/tv_denoising.cu
@@ -3,6 +3,44 @@
 #include <device_launch_parameters.h>
 #include <opencv2/opencv.hpp>
 
+#define BLOCK_SIZE_X 16
+#define BLOCK_SIZE_Y 16
+
+// In-place iterative smoothing of a row-major width x height float image.
+// Each iteration: I += lambda * (gx(x,y) - gx(x-1,y) + gy(x,y) - gy(x,y-1)), with central
+// differences gx = I(x+1,y) - I(x-1,y), gy = I(x,y+1) - I(x,y-1). Pixels outside the image
+// are read as the nearest edge pixel. Launch on a 2D grid covering the image; threads that
+// fall outside the image write nothing but still take part in the barriers.
+// NOTE: __syncthreads() orders reads/writes within one block only; with maxIterations > 1,
+// pixels near block borders may read neighbours from a different iteration of another block.
+__global__ void tvDenoisingKernel(float* image, int width, int height, float lambda, int maxIterations)
+{
+    if (width <= 0 || height <= 0) return;  // uniform exit: nothing to process
+
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    const bool inside = col < width && row < height;
+
+    // Clamp every coordinate so border and surplus threads never read out of bounds.
+    const int x0 = min(col, width - 1), xp1 = min(x0 + 1, width - 1);
+    const int xm1 = max(x0 - 1, 0), xm2 = max(x0 - 2, 0);
+    const int y0 = min(row, height - 1), yp1 = min(y0 + 1, height - 1);
+    const int ym1 = max(y0 - 1, 0), ym2 = max(y0 - 2, 0);
+    const size_t oM2 = (size_t)ym2 * width, oM1 = (size_t)ym1 * width;
+    const size_t o0 = (size_t)y0 * width, oP1 = (size_t)yp1 * width;
+
+    for (int iteration = 0; iteration < maxIterations; ++iteration)
+    {
+        // Read phase: fetch the whole stencil into registers before anyone writes.
+        const float center = image[o0 + x0];
+        const float gx = image[o0 + xp1] - image[o0 + xm1], gxPrev = center - image[o0 + xm2];
+        const float gy = image[oP1 + x0] - image[oM1 + x0], gyPrev = center - image[oM2 + x0];
+        __syncthreads();  // every thread of the block has read before any thread writes
+        if (inside)
+            image[o0 + x0] = center + lambda * (gx - gxPrev + gy - gyPrev);
+        __syncthreads();  // writes are visible to the block before the next iteration reads
+    }
+}
 
 extern "C" void TVDenoising(cv::Mat& image, float lambda, int maxIterations)
 {
@@ -25,7 +63,15 @@ extern "C" void TVDenoising(cv::Mat& image, float lambda, int maxIterations)
     // Copy the image data from host to device
     cudaMemcpy(d_image, floatImage.ptr<float>(0), width * height * sizeof(float), cudaMemcpyHostToDevice);
 
- 
+    // Invoke the TV denoising kernel
+    tvDenoisingKernel<<<gridSize, blockSize>>>(d_image, width, height, lambda, maxIterations);
+
+    // Copy the denoised image data back from device to host
+    cudaMemcpy(floatImage.ptr<float>(0), d_image, width * height * sizeof(float), cudaMemcpyDeviceToHost);
+
+    // Convert the denoised image back to the original data type
+    floatImage.convertTo(image, image.type());
+
     // Free the GPU memory
     cudaFree(d_image);
 }