Merge pull request #140 from AcademySoftwareFoundation/gpu-gain-openc…

…l-images Add OpenCL Image support to the updated GPU rendering extension:
AcademySoftwareFoundation · Feb 24, 2024 · e18fd49 · e18fd49
2 parents b9ded56 + db1c87a
commit e18fd49
Show file tree

Hide file tree

Showing 7 changed files with 380 additions and 115 deletions.
diff --git a/Examples/GPUGain/GPUGain.cpp b/Examples/GPUGain/GPUGain.cpp
@@ -12,7 +12,7 @@
 
 #define kPluginName "GPU Gain"
 #define kPluginGrouping "OFX Example"
-#define kPluginDescription "Apply separate RGB gain adjustments to each channels; CUDA/OpenCL/Metal"
+#define kPluginDescription "Apply separate RGB gain adjustments to each channels; CUDA/OpenCL Buffers/OpenCL Images/Metal"
 #define kPluginIdentifier "com.OpenFXSample.GPUGain"
 #define kPluginVersionMajor 1
 #define kPluginVersionMinor 0
@@ -28,7 +28,7 @@ class GainExample : public OFX::ImageProcessor
 public:
     explicit GainExample(OFX::ImageEffect& p_Instance);
 
-    virtual void processImagesCUDA();
+    virtual void processImagesCuda();
     virtual void processImagesOpenCL();
     virtual void processImagesMetal();
     virtual void multiThreadProcessImages(OfxRectI p_ProcWindow);
@@ -50,7 +50,7 @@ GainExample::GainExample(OFX::ImageEffect& p_Instance)
 extern void RunCudaKernel(void* p_Stream, int p_Width, int p_Height, float* p_Gain, const float* p_Input, float* p_Output);
 #endif
 
-void GainExample::processImagesCUDA()
+void GainExample::processImagesCuda()
 {
 #ifdef OFX_SUPPORTS_CUDARENDER
     const OfxRectI& bounds = _srcImg->getBounds();
@@ -82,7 +82,8 @@ void GainExample::processImagesMetal()
 #endif
 }
 
-extern void RunOpenCLKernel(void* p_CmdQ, int p_Width, int p_Height, float* p_Gain, const float* p_Input, float* p_Output);
+extern void RunOpenCLKernelBuffers(void* p_CmdQ, int p_Width, int p_Height, float* p_Gain, const float* p_Input, float* p_Output);
+extern void RunOpenCLKernelImages(void* p_CmdQ, int p_Width, int p_Height, float* p_Gain, const float* p_Input, float* p_Output);
 
 void GainExample::processImagesOpenCL()
 {
@@ -91,10 +92,22 @@ void GainExample::processImagesOpenCL()
     const int width = bounds.x2 - bounds.x1;
     const int height = bounds.y2 - bounds.y1;
 
-    float* input = static_cast<float*>(_srcImg->getPixelData());
-    float* output = static_cast<float*>(_dstImg->getPixelData());
+    float* input = static_cast<float*>(_srcImg->getOpenCLImage());
+    float* output = static_cast<float*>(_dstImg->getOpenCLImage());
+
+    // if a plugin supports both OpenCL Buffers and Images, the host decides which is used and
+    // the plugin must determine which based on whether kOfxImageEffectPropOpenCLImage or kOfxImagePropData is set
+    if (input || output)
+    {
+        RunOpenCLKernelImages(_pOpenCLCmdQ, width, height, _scales, input, output);
+    }
+    else
+    {
+        input = static_cast<float*>(_srcImg->getPixelData());
+        output = static_cast<float*>(_dstImg->getPixelData());
 
-    RunOpenCLKernel(_pOpenCLCmdQ, width, height, _scales, input, output);
+        RunOpenCLKernelBuffers(_pOpenCLCmdQ, width, height, _scales, input, output);
+}
 #endif
 }
 
@@ -351,7 +364,8 @@ void GPUGainFactory::describe(OFX::ImageEffectDescriptor& p_Desc)
     p_Desc.setSupportsMultipleClipPARs(kSupportsMultipleClipPARs);
 
     // Setup OpenCL render capability flags
-    p_Desc.setSupportsOpenCLRender(true);
+    p_Desc.setSupportsOpenCLBuffersRender(true);
+    p_Desc.setSupportsOpenCLImagesRender(true);
 
     // Setup CUDA render capability flags on non-Apple system
 #ifndef __APPLE__

diff --git a/Examples/GPUGain/OpenCLKernel.cpp b/Examples/GPUGain/OpenCLKernel.cpp
@@ -15,8 +15,8 @@
 #include <CL/cl.h>
 #endif
 
-const char *KernelSource = "\n" \
-"__kernel void GainAdjustKernel(                                        \n" \
+const char *KernelSourceBuffers = "\n" \
+"__kernel void GainAdjustKernelBuffers(                                 \n" \
 "   int p_Width,                                                        \n" \
 "   int p_Height,                                                       \n" \
 "   float p_GainR,                                                      \n" \
@@ -41,6 +41,32 @@ const char *KernelSource = "\n" \
 "}                                                                      \n" \
 "\n";
 
+const char *KernelSourceImages = "\n" \
+"__constant sampler_t imageSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;  \n" \
+"                                                                                                           \n" \
+"__kernel void GainAdjustKernelImages(                                                                      \n" \
+"   int p_Width,                                                                                            \n" \
+"   int p_Height,                                                                                           \n" \
+"   float p_GainR,                                                                                          \n" \
+"   float p_GainG,                                                                                          \n" \
+"   float p_GainB,                                                                                          \n" \
+"   float p_GainA,                                                                                          \n" \
+"   __read_only  image2d_t p_Input,                                                                         \n" \
+"   __write_only image2d_t p_Output)                                                                        \n" \
+"{                                                                                                          \n" \
+"   const int x = get_global_id(0);                                                                         \n" \
+"   const int y = get_global_id(1);                                                                         \n" \
+"                                                                                                           \n" \
+"   if ((x < p_Width) && (y < p_Height))                                                                    \n" \
+"   {                                                                                                       \n" \
+"       int2 coord = (int2)(x, y);                                                                          \n" \
+"       float4 out = read_imagef(p_Input, imageSampler, coord);                                             \n" \
+"       out *= (float4)(p_GainR, p_GainG, p_GainB, p_GainA);                                                \n" \
+"       write_imagef(p_Output, coord, out);                                                                 \n" \
+"   }                                                                                                       \n" \
+"}                                                                                                          \n" \
+"\n";
+
 void CheckError(cl_int p_Error, const char* p_Msg)
 {
     if (p_Error != CL_SUCCESS)
@@ -96,7 +122,7 @@ class Locker
 #endif
 };
 
-void RunOpenCLKernel(void* p_CmdQ, int p_Width, int p_Height, float* p_Gain, const float* p_Input, float* p_Output)
+void RunOpenCLKernelBuffers(void* p_CmdQ, int p_Width, int p_Height, float* p_Gain, const float* p_Input, float* p_Output)
 {
     cl_int error;
 
@@ -132,13 +158,13 @@ void RunOpenCLKernel(void* p_CmdQ, int p_Width, int p_Height, float* p_Gain, con
         error = clGetCommandQueueInfo(cmdQ, CL_QUEUE_CONTEXT, sizeof(cl_context), &clContext, NULL);
         CheckError(error, "Unable to get the context");
 
-        cl_program program = clCreateProgramWithSource(clContext, 1, (const char **)&KernelSource, NULL, &error);
+        cl_program program = clCreateProgramWithSource(clContext, 1, (const char **)&KernelSourceBuffers, NULL, &error);
         CheckError(error, "Unable to create program");
 
         error = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
         CheckError(error, "Unable to build program");
 
-        kernel = clCreateKernel(program, "GainAdjustKernel", &error);
+        kernel = clCreateKernel(program, "GainAdjustKernelBuffers", &error);
         CheckError(error, "Unable to create kernel");
 
         kernelMap[cmdQ] = kernel;
@@ -169,3 +195,77 @@ void RunOpenCLKernel(void* p_CmdQ, int p_Width, int p_Height, float* p_Gain, con
 
     clEnqueueNDRangeKernel(cmdQ, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
 }
+
+void RunOpenCLKernelImages(void* p_CmdQ, int p_Width, int p_Height, float* p_Gain, const float* p_Input, float* p_Output)
+{
+    cl_int error;
+
+    cl_command_queue cmdQ = static_cast<cl_command_queue>(p_CmdQ);
+
+    // store device id and kernel per command queue (required for multi-GPU systems)
+    static std::map<cl_command_queue, cl_device_id> deviceIdMap;
+    static std::map<cl_command_queue, cl_kernel> kernelMap;
+
+    static Locker locker; // simple lock to control access to the above maps from multiple threads
+
+    locker.Lock();
+
+    // find the device id corresponding to the command queue
+    cl_device_id deviceId = NULL;
+    if (deviceIdMap.find(cmdQ) == deviceIdMap.end())
+    {
+        error = clGetCommandQueueInfo(cmdQ, CL_QUEUE_DEVICE, sizeof(cl_device_id), &deviceId, NULL);
+        CheckError(error, "Unable to get the device");
+
+        deviceIdMap[cmdQ] = deviceId;
+    }
+    else
+    {
+        deviceId = deviceIdMap[cmdQ];
+    }
+
+    // find the program kernel corresponding to the command queue
+    cl_kernel kernel = NULL;
+    if (kernelMap.find(cmdQ) == kernelMap.end())
+    {
+        cl_context clContext = NULL;
+        error = clGetCommandQueueInfo(cmdQ, CL_QUEUE_CONTEXT, sizeof(cl_context), &clContext, NULL);
+        CheckError(error, "Unable to get the context");
+
+        cl_program program = clCreateProgramWithSource(clContext, 1, (const char **)&KernelSourceImages, NULL, &error);
+        CheckError(error, "Unable to create program");
+
+        error = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+        CheckError(error, "Unable to build program");
+
+        kernel = clCreateKernel(program, "GainAdjustKernelImages", &error);
+        CheckError(error, "Unable to create kernel");
+
+        kernelMap[cmdQ] = kernel;
+    }
+    else
+    {
+        kernel = kernelMap[cmdQ];
+    }
+
+    locker.Unlock();
+
+    int count = 0;
+    error = clSetKernelArg(kernel, count++, sizeof(int), &p_Width);
+    error |= clSetKernelArg(kernel, count++, sizeof(int), &p_Height);
+    error |= clSetKernelArg(kernel, count++, sizeof(float), &p_Gain[0]);
+    error |= clSetKernelArg(kernel, count++, sizeof(float), &p_Gain[1]);
+    error |= clSetKernelArg(kernel, count++, sizeof(float), &p_Gain[2]);
+    error |= clSetKernelArg(kernel, count++, sizeof(float), &p_Gain[3]);
+    error |= clSetKernelArg(kernel, count++, sizeof(cl_mem), &p_Input);
+    error |= clSetKernelArg(kernel, count++, sizeof(cl_mem), &p_Output);
+    CheckError(error, "Unable to set kernel arguments");
+
+    size_t localWorkSize[2], globalWorkSize[2];
+    clGetKernelWorkGroupInfo(kernel, deviceId, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), localWorkSize, NULL);
+    localWorkSize[1] = 1;
+    globalWorkSize[0] = ((p_Width + localWorkSize[0] - 1) / localWorkSize[0]) * localWorkSize[0];
+    globalWorkSize[1] = p_Height;
+
+    clEnqueueNDRangeKernel(cmdQ, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
+}
diff --git a/Support/Library/ofxsImageEffect.cpp b/Support/Library/ofxsImageEffect.cpp
@@ -667,12 +667,18 @@ namespace OFX {
     }
   }
 
-  /** @brief Does the plugin support OpenCL Render */
-  void ImageEffectDescriptor::setSupportsOpenCLRender(bool v)
+  /** @brief Does the plugin support OpenCL Buffers Render */
+  void ImageEffectDescriptor::setSupportsOpenCLBuffersRender(bool v)
   {
       _effectProps.propSetString(kOfxImageEffectPropOpenCLRenderSupported, (v ? "true" : "false"));
   }
 
+  /** @brief Does the plugin support OpenCL Images Render */
+  void ImageEffectDescriptor::setSupportsOpenCLImagesRender(bool v)
+  {
+      _effectProps.propSetString(kOfxImageEffectPropOpenCLSupported, (v ? "true" : "false"));
+  }
+
   /** @brief Does the plugin support CUDA Render */
   void ImageEffectDescriptor::setSupportsCudaRender(bool v)
   {
@@ -759,7 +765,7 @@ namespace OFX {
     OFX::Validation::validateImageBaseProperties(props);
 
     // and fetch all the properties
-    _rowBytes         = _imageProps.propGetInt(kOfxImagePropRowBytes);
+    _rowBytes         = _imageProps.propGetInt(kOfxImagePropRowBytes, /*throwOnFailure*/false); // not required for OpenCL Images
     _pixelAspectRatio = _imageProps.propGetDouble(kOfxImagePropPixelAspectRatio);;
 
     std::string str  = _imageProps.propGetString(kOfxImageEffectPropComponents);
@@ -849,8 +855,10 @@ namespace OFX {
     OFX::Validation::validateImageProperties(props);
 
     // and fetch all the properties
+    _OpenCLImage = nullptr;
+    _OpenCLImage = _imageProps.propGetPointer(kOfxImageEffectPropOpenCLImage, /*throwOnFailure*/false);
     // should throw if it is not an image
-    _pixelData = _imageProps.propGetPointer(kOfxImagePropData);
+    _pixelData = _imageProps.propGetPointer(kOfxImagePropData, /*throwOnFailure*/!_OpenCLImage);
   }
 
   Image::~Image()

diff --git a/Support/include/ofxsImageEffect.h b/Support/include/ofxsImageEffect.h
@@ -434,13 +434,16 @@ namespace OFX {
     /** @brief If the slave  param changes the clip preferences need to be re-evaluated */
     void addClipPreferencesSlaveParam(ParamDescriptor &p);
 
-    /** @brief Does the plugin support OpenCL Render, defaults to false */
-    void setSupportsOpenCLRender(bool v);
+    /** @brief Does the plugin support OpenCL Buffers Render, defaults to false */
+    void setSupportsOpenCLBuffersRender(bool v);
+
+    /** @brief Does the plugin support OpenCL Images Render, defaults to false */
+    void setSupportsOpenCLImagesRender(bool v);
 
     /** @brief Does the plugin support CUDA Render, defaults to false */
     void setSupportsCudaRender(bool v);
 
-    /** @brief Does the plugin support CUDA Render, defaults to false */
+    /** @brief Does the plugin support CUDA Stream Render, defaults to false */
     void setSupportsCudaStream(bool v);
 
     /** @brief Does the plugin support Metal Render, defaults to false */
@@ -550,6 +553,7 @@ namespace OFX {
   class Image : public ImageBase {
   protected :
     void     *_pixelData;                    /**< @brief the base address of the image */
+    void     *_OpenCLImage;                  /**< @brief the OpenCL Image handle */
 
   public :
     /** @brief ctor */
@@ -564,6 +568,12 @@ namespace OFX {
     /** @brief get the pixel data for this image */
     const void *getPixelData(void) const { return _pixelData;}
 
+    /** @brief get the OpenCL Image for this image */
+    void *getOpenCLImage(void) { return _OpenCLImage;}
+
+    /** @brief get the OpenCL Image for this image */
+    const void *getOpenCLImage(void) const { return _OpenCLImage;}
+
     /** @brief return a pixel pointer, returns NULL if (x,y) is outside the image bounds
 
     x and y are in pixel coordinates

diff --git a/Support/include/ofxsProcessing.h b/Support/include/ofxsProcessing.h
@@ -145,9 +145,9 @@ namespace OFX {
         };
 
         /** @brief this is called by process to actually process images using CUDA when isEnabledCudaRender is true, override in derived classes */
-        virtual void processImagesCUDA(void)
+        virtual void processImagesCuda(void)
         {
-            OFX::Log::print("processImagesCUDA not implemented");
+            OFX::Log::print("processImagesCuda not implemented");
             OFX::throwSuiteStatusException(kOfxStatErrUnsupported);
         };
 
@@ -197,7 +197,7 @@ namespace OFX {
             else if (_isEnabledCudaRender)
             {
               OFX::Log::print("processing via CUDA");
-                processImagesCUDA();
+                processImagesCuda();
             }
             else if (_isEnabledMetalRender)
             {