Modified functions to use clarifai v10.0.1 (#24)

* Modified batch_size args for ingestion script
* Update image_ingestion_clarifai_job_nb.ipynb
* Modified export inputs to df function
* Addressed clarifai v10.0.1 changes
* Modified functions to include text annotations
Clarifai · Jan 18, 2024 · 691aa30 · 691aa30
1 parent b1cf0c3
commit 691aa30
Show file tree

Hide file tree

Showing 3 changed files with 8 additions and 11 deletions.
diff --git a/Job_script/image_ingestion_clarifai_job_nb.ipynb b/Job_script/image_ingestion_clarifai_job_nb.ipynb
@@ -27,12 +27,12 @@
     "from clarifaipyspark.client import ClarifaiPySpark\n",
     "\n",
     "\n",
-    "def ingest_image_dataframe(folder_path, user_id, app_id, dataset_id):\n",
+    "def ingest_image_into_app(folder_path, user_id, app_id, dataset_id):\n",
     "    \n",
     "    cspark_obj = ClarifaiPySpark(user_id=user_id, app_id=app_id, pat=dbutils.secrets.get(\"clarifai\",\"CLARIFAI_PAT\"))\n",
     "    dataset_obj = cspark_obj.dataset(dataset_id=dataset_id)\n",
     "    try:\n",
-    "      dataset_obj.upload_dataset_from_folder(folder_path,input_type='image',labels= False)\n",
+    "      dataset_obj.upload_dataset_from_folder(folder_path,input_type='image',labels= False, batch_size = 15)\n",
     "    \n",
     "    except Exception as e:\n",
     "        raise RuntimeError(f\"Error uploading images: {e}\")\n",
@@ -42,7 +42,7 @@
     "app_id=dbutils.widgets.get(\"app_id\")\n",
     "user_id=dbutils.widgets.get(\"user_id\")\n",
     "dataset_id=dbutils.widgets.get(\"dataset_id\")\n",
-    "image_df =ingest_image_dataframe(file_path,user_id,app_id,dataset_id)\n"
+    "image_df =ingest_image_into_app(file_path,user_id,app_id,dataset_id)\n"
    ]
   }
  ],

diff --git a/clarifaipyspark/dataset.py b/clarifaipyspark/dataset.py
@@ -337,7 +337,7 @@ def export_annotations_to_dataframe(self, input_ids: list = None, input_type: st
     response = list(self.list_annotations(input_ids=input_ids, input_type=input_type))
     for an in response:
       temp = {}
-      temp['annotation'] = str(an.data)
+      temp['annotation'] = str(an.data) if not an.data.metadata else {}
       if not temp['annotation'] or temp['annotation'] == '{}':
         continue
       temp['annotation_id'] = an.id
@@ -415,11 +415,8 @@ def export_inputs_to_dataframe(self, input_type):
     """
     if input_type not in ('image', 'text'):
       raise UserError('Invalid input type, it should be image or text')
-
-    search_obj = Search(user_id=self.user_id, app_id=self.app_id, pat=self.pat)
-    search_response = search_obj.query(filters=[{"input_types":["image"]},{"input_dataset_ids":[self.dataset_id]}])
-    inputs=[hit.input for response in search_response for hit in response.hits]
-
+
+    inputs= list(self.list_inputs(input_type=input_type))
     input_list=[]
     for inp in inputs:
       temp = {}
@@ -492,7 +489,7 @@ def export_annotations_to_volume(self, volumepath: str):
     images_to_download=[]
     for an in response:
       temp = {}
-      temp['annotation'] = str(an.data)
+      temp['annotation'] = str(an.data) if not an.data.metadata else {}
       if not temp['annotation'] or temp['annotation'] == '{}':
         continue
       temp['annotation_id'] = an.id

diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,2 @@
-clarifai==9.10.3
+clarifai==10.0.1
 pyspark==3.5.0