
CU-8694hukwm: Document the materialising of generator when multiprocessing and batching for docs (CogStack#433)

* CU-8694hukwm: Document the materialising of generator when multiprocessing and batching for docs

* CU-8694hukwm: Add TODO note for where the generator is materialised

* CU-8694hukwm: Add warning when a large amount of generator data (over 10k items) is materialised by the docs size mp method
mart-r authored May 22, 2024
1 parent fbe9745 commit e46dca8
Showing 1 changed file with 21 additions and 1 deletion.
22 changes: 21 additions & 1 deletion medcat/cat.py
@@ -49,6 +49,8 @@

HAS_NEW_SPACY = has_new_spacy()

MIN_GEN_LEN_FOR_WARN = 10_000


class CAT(object):
"""The main MedCAT class used to annotate documents, it is built on top of spaCy
@@ -1526,6 +1528,11 @@ def multiprocessing_batch_docs_size(self,
This method batches the data based on the number of documents as specified by the user.
NOTE: When providing a generator for `data`, the generator is evaluated (`list(in_data)`)
and thus all the data is kept in memory and (potentially) duplicated for use in
multiple threads. So if you're using a lot of data, it may be better to use
`CAT.multiprocessing_batch_char_size` instead.
PS:
This method supports Windows.
@@ -1550,7 +1557,20 @@
if nproc == 0:
raise ValueError("nproc cannot be set to zero")

in_data = list(in_data) if isinstance(in_data, Iterable) else in_data
        # TODO: Surely there's a way to not materialise all of the incoming data in memory?
        #       This is counterproductive for allowing the passing of generators.
if isinstance(in_data, Iterable):
in_data = list(in_data)
in_data_len = len(in_data)
if in_data_len > MIN_GEN_LEN_FOR_WARN:
# only point this out when it's relevant, i.e over 10k items
logger.warning("The `CAT.multiprocessing_batch_docs_size` method just "
f"materialised {in_data_len} items from the generator it "
"was provided. This may use up a considerable amount of "
"RAM, especially since the data may be duplicated across "
"multiple threads when multiprocessing is used. If the "
                               "process is killed after this warning, please use the "
"alternative method `multiprocessing_batch_char_size` instead")
n_process = nproc if nproc is not None else min(max(cpu_count() - 1, 1), math.ceil(len(in_data) / batch_factor))
batch_size = batch_size if batch_size is not None else math.ceil(len(in_data) / (batch_factor * abs(n_process)))
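The process-count and batch-size formulas on the last two changed lines can be checked with a small standalone sketch. The helper name `compute_pool_params` and the example values (100 docs, `batch_factor` of 2, an 8-core machine) are made up for illustration; the real method reads the core count from `multiprocessing.cpu_count()`.

```python
import math

def compute_pool_params(n_docs, batch_factor, nproc=None, batch_size=None,
                        cpu_count=8):
    """Mirror the n_process / batch_size derivation from the diff,
    with cpu_count injected as a parameter for reproducibility."""
    # Use one fewer process than cores (at least 1), but never more
    # processes than there are batches of work.
    n_process = nproc if nproc is not None else min(
        max(cpu_count - 1, 1), math.ceil(n_docs / batch_factor))
    # Split the documents so each process gets batch_factor batches.
    if batch_size is None:
        batch_size = math.ceil(n_docs / (batch_factor * abs(n_process)))
    return n_process, batch_size

# 100 docs, batch_factor of 2, on a hypothetical 8-core machine:
print(compute_pool_params(100, 2))  # -> (7, 8)
```

With explicit `nproc`/`batch_size` arguments the computed defaults are bypassed, matching the conditional expressions in the diff.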

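The warning logic this commit adds can be illustrated with a minimal standalone sketch. The function name `materialise_with_warning` is hypothetical; the threshold mirrors the new `MIN_GEN_LEN_FOR_WARN` constant.

```python
import logging
from collections.abc import Iterable

logger = logging.getLogger(__name__)
MIN_GEN_LEN_FOR_WARN = 10_000

def materialise_with_warning(in_data):
    """Materialise an iterable (e.g. a generator) into a list, warning
    when the result is large enough to strain memory."""
    if isinstance(in_data, Iterable):
        in_data = list(in_data)  # the whole generator is now in memory
        if len(in_data) > MIN_GEN_LEN_FOR_WARN:
            logger.warning(
                "Materialised %d items from the provided generator; this "
                "may use a considerable amount of RAM, especially if the "
                "data is duplicated across worker processes.", len(in_data))
    return in_data

# A 15k-item generator crosses the threshold and triggers the warning:
docs = materialise_with_warning(str(i) for i in range(15_000))
```

This is why the docstring steers callers with very large inputs towards `CAT.multiprocessing_batch_char_size`, which can consume the generator lazily instead of holding all items at once.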
