// NOTE(review): the original header names were lost (only bare `#include`
// tokens remained). The four headers below are what the code in this file
// requires -- confirm against the project's original include list.
#include <iostream>
#include <opencv2/opencv.hpp>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include <stdexcept>
#include <string>
#include <utility>

#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 16

namespace {

// Throws std::runtime_error describing `what` when a CUDA runtime call fails.
void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        throw std::runtime_error(std::string("TVDenoising: ") + what +
                                 " failed: " + cudaGetErrorString(status));
    }
}

// Owns one device allocation and frees it on scope exit, so an exception
// thrown half-way through TVDenoising cannot leak GPU memory.
class DeviceBuffer {
public:
    explicit DeviceBuffer(size_t bytes)
    {
        checkCuda(cudaMalloc(&ptr_, bytes), "cudaMalloc");
    }
    ~DeviceBuffer() { cudaFree(ptr_); }

    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;

    float* get() const { return static_cast<float*>(ptr_); }

private:
    void* ptr_ = nullptr;
};

}  // namespace

// Reads channel `c` of pixel (px, py) from an interleaved image, clamping the
// coordinates to the image so that border pixels replicate their nearest
// in-range neighbour instead of reading out of bounds.
__device__ __forceinline__ float loadClamped(const float* img, int px, int py, int c,
                                             int width, int height, int channels)
{
    px = min(max(px, 0), width - 1);
    py = min(max(py, 0), height - 1);
    return img[(static_cast<size_t>(py) * width + px) * channels + c];
}

// Evaluates the update rule for one sample:
//   I + lambda * (gx(x,y) - gx(x-1,y) + gy(x,y) - gy(x,y-1))
// where gx / gy are the differences I(x+1)-I(x-1) and I(y+1)-I(y-1).
// The neighbours' gradients are expanded in terms of the image itself
// (gx(x-1,y) = I(x,y) - I(x-2,y), gy(x,y-1) = I(x,y) - I(x,y-2)), so no
// per-block gradient tile -- and therefore no halo -- is needed.
__device__ __forceinline__ float tvUpdatedValue(const float* img, int px, int py, int c,
                                                int width, int height, int channels,
                                                float lambda)
{
    const float center    = loadClamped(img, px,     py,     c, width, height, channels);
    const float gradX     = loadClamped(img, px + 1, py,     c, width, height, channels)
                          - loadClamped(img, px - 1, py,     c, width, height, channels);
    const float gradXLeft = center
                          - loadClamped(img, px - 2, py,     c, width, height, channels);
    const float gradY     = loadClamped(img, px,     py + 1, c, width, height, channels)
                          - loadClamped(img, px,     py - 1, c, width, height, channels);
    const float gradYUp   = center
                          - loadClamped(img, px,     py - 2, c, width, height, channels);

    return center + lambda * (gradX - gradXLeft + gradY - gradYUp);
}

// One iteration of the update, out of place.
//
// Launch layout: 2D grid of 2D blocks; x indexes the interleaved samples of a
// row (width * channels of them), y indexes rows. Any block shape works, and
// no shared memory is used. `src` and `dst` must be distinct buffers of
// width * height * channels floats: because every thread reads only `src` and
// writes only its own `dst` sample, there is no race between blocks.
__global__ void tvDenoisingStepKernel(const float* __restrict__ src, float* __restrict__ dst,
                                      int width, int height, int channels, float lambda)
{
    const int rowElems = width * channels;
    const int elem = blockIdx.x * blockDim.x + threadIdx.x;
    const int row  = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so tail threads have no sample.
    if (elem >= rowElems || row >= height) {
        return;
    }

    const int px = elem / channels;
    const int c  = elem - px * channels;

    dst[static_cast<size_t>(row) * rowElems + elem] =
        tvUpdatedValue(src, px, row, c, width, height, channels, lambda);
}

// In-place, multi-iteration variant kept for source compatibility (same name
// and parameters as before). It treats `image` as a single-channel
// width x height float array.
//
// It is now memory-safe for any launch shape, but because it updates `image`
// in place and __syncthreads() only synchronises one block, the result is
// race-free only when the whole image is covered by a single thread block.
// TVDenoising() uses tvDenoisingStepKernel with two buffers instead.
__global__ void tvDenoisingKernel(float* image, int width, int height, float lambda,
                                  int maxIterations)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const bool inside = (col < width) && (row < height);
    const size_t index = static_cast<size_t>(row) * width + col;

    for (int iteration = 0; iteration < maxIterations; ++iteration) {
        float updated = 0.0f;
        if (inside) {
            updated = tvUpdatedValue(image, col, row, 0, width, height, 1, lambda);
        }

        // Every thread of the block must finish reading before anyone writes.
        // The barrier is outside the `inside` branch so all threads reach it.
        __syncthreads();

        if (inside) {
            image[index] = updated;
        }

        // Make this iteration's writes visible to the block's next reads.
        __syncthreads();
    }
}

// Runs `maxIterations` update steps on `image` on the GPU and writes the
// result back into `image`, converted to its original depth.
//
// image          Input/output image. Processed in 32-bit float; every channel
//                is filtered independently. An empty image is left untouched.
// lambda         Step size applied to the update term.
// maxIterations  Number of update steps; values <= 0 perform no steps.
//
// Throws std::runtime_error if any CUDA runtime call or kernel launch fails;
// `image` is not modified in that case.
extern "C" void TVDenoising(cv::Mat& image, float lambda, int maxIterations)
{
    if (image.empty()) {
        return;
    }

    // Work on a float copy of the image.
    cv::Mat floatImage;
    image.convertTo(floatImage, CV_32F);
    if (!floatImage.isContinuous()) {
        // The flat device copy below assumes rows are stored back to back.
        floatImage = floatImage.clone();
    }

    const int width    = floatImage.cols;
    const int height   = floatImage.rows;
    const int channels = floatImage.channels();
    const int rowElems = width * channels;

    // Compute the byte count in size_t so large images cannot overflow int.
    const size_t bytes =
        static_cast<size_t>(height) * static_cast<size_t>(rowElems) * sizeof(float);

    const dim3 blockSize(BLOCK_SIZE_X, BLOCK_SIZE_Y);
    const dim3 gridSize((rowElems + blockSize.x - 1) / blockSize.x,
                        (height + blockSize.y - 1) / blockSize.y);

    // Two buffers: each step reads one and writes the other, then they swap.
    DeviceBuffer bufferA(bytes);
    DeviceBuffer bufferB(bytes);
    float* d_src = bufferA.get();
    float* d_dst = bufferB.get();

    checkCuda(cudaMemcpy(d_src, floatImage.ptr<float>(), bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy (host to device)");

    // Separate launches on the same stream execute in order, which gives the
    // grid-wide barrier between iterations that a single launch cannot.
    for (int iteration = 0; iteration < maxIterations; ++iteration) {
        tvDenoisingStepKernel<<<gridSize, blockSize>>>(d_src, d_dst, width, height,
                                                       channels, lambda);
        checkCuda(cudaGetLastError(), "tvDenoisingStepKernel launch");
        std::swap(d_src, d_dst);
    }

    // After the final swap the newest result is in d_src. This blocking copy
    // also surfaces any asynchronous kernel execution error.
    checkCuda(cudaMemcpy(floatImage.ptr<float>(), d_src, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy (device to host)");

    // Convert back to the caller's original depth.
    floatImage.convertTo(image, image.type());
}