Skip to content

Commit

Permalink
Added phenix measurements
Browse files Browse the repository at this point in the history
  • Loading branch information
wflynny committed Jul 10, 2020
1 parent 7af3664 commit a9e263e
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 9 deletions.
47 changes: 38 additions & 9 deletions post_processing/hcs_data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@
class HighContentScreen:

def __init__(self, input_form_path, overwrite=False):
self._allowed_measurements = {"spectramax": self._load_spectramax}
self._allowed_measurements = {
"spectramax": self._load_spectramax,
"phenix": self._load_phenix
}
self.has_randomization = False
self.input_form_path = input_form_path
self.overwrite_allowed = overwrite
Expand Down Expand Up @@ -130,7 +133,7 @@ def map_randomization(self):
logger.debug("Dilution and drug data mapped using randomization mapping.")

@staticmethod
def _load_spectramax(spectramax_file):
def _load_spectramax(spectramax_file, ignored):
data = (
pd.read_table(
spectramax_file,
Expand All @@ -145,18 +148,37 @@ def _load_spectramax(spectramax_file):
)
flat_data = utils.flatten_plate_map(data, colwise=False)
logger.debug("Spectramax data constructed successfully.")
return utils.construct_384(flat_data, "spectramax", colwise=False)
return utils.construct_384(flat_data, "spectramax", colwise=False), None

@staticmethod
def _load_phenix(phenix_file, phenix_columns):
    """Load a Phenix export file and select the requested columns.

    Parameters
    ----------
    phenix_file : path-like
        Tab-separated Phenix export; column 2 is used as the index.
    phenix_columns : str or int or None
        Column specification understood by ``utils.parse_column_spec``
        (an index, an Excel-style letter, a name, a ``:`` range, or a
        ``,``-separated list).

    Returns
    -------
    tuple(pd.DataFrame, list[str])
        The selected data with columns renamed to ``"Phenix - <name>"``,
        and the list of those new column names.
    """
    data = pd.read_table(phenix_file, index_col=2)
    cols = utils.parse_column_spec(phenix_columns)
    logger.debug(f"Will use phenix data columns: [{cols}]")
    try:
        # First treat the spec as column labels...
        data = data.loc[:, cols]
    except (KeyError, TypeError, IndexError):
        # ...and fall back to positional selection when label-based
        # lookup fails (e.g. the spec parsed to ints or an int slice).
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        data = data.iloc[:, cols]
    logger.debug("Phenix data constructed successfully.")
    if len(data.shape) == 1:
        # A single selected column comes back as a Series; normalize to
        # a one-column DataFrame so the renaming below works uniformly.
        data = data.to_frame()
    data.columns = "Phenix - " + data.columns
    return data, data.columns.tolist()

def load_measurements(self):
    """Load every configured measurement file and append the combined
    result to ``self.data``.

    Iterates ``self.measurement_files`` (name -> (path, column spec)),
    dispatches each file to its loader from ``self._allowed_measurements``,
    and records the resulting measurement names in ``self.measurements``.
    Unknown measurement types are skipped with a warning.
    """
    measurements = []
    self.measurements = []
    for m_name, (m_file, m_cols) in self.measurement_files.items():
        assert_path_exists(m_name, m_file)
        if m_name not in self._allowed_measurements:
            # Bug fix: previously fell through to the loader lookup
            # below and raised KeyError despite logging "Skipping".
            # logger.warn is deprecated; use logger.warning.
            logger.warning(f"Measurement {m_name} not configured. Skipping.")
            continue
        logger.debug(f"Trying to load {m_name} from file [{m_file}]")
        m_data, new_names = self._allowed_measurements[m_name](m_file, m_cols)
        measurements.append(m_data)
        if new_names:
            # Loaders like _load_phenix return per-column names.
            self.measurements += new_names
        else:
            # Loaders like _load_spectramax return a single series.
            self.measurements.append(m_name)
    logger.debug("All measurements loaded successfully.")
    self.data.append(pd.concat(measurements, axis=1))

Expand Down Expand Up @@ -256,9 +278,13 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"-m", "--measurement",
dest="measurements",
nargs=2, action="append",
nargs="+", action="append",
required=False,
help="Path to Spectra Max export file",
help=(
"Measurement specification: measurement_name measurement_file "
"[columns_to_use]. If [columns_to_use] is not given, will use "
" all columns presented."
)
)
parser.add_argument(
"-o",
Expand All @@ -285,7 +311,10 @@ def parse_args() -> argparse.Namespace:

args = parser.parse_args()
if args.measurements:
args.measurements = dict((item[0], Path(item[1])) for item in args.measurements)
args.measurements = dict(
(item[0], [Path(item[1]), None if len(item)==2 else item[2]])
for item in args.measurements
)

return args

Expand Down
35 changes: 35 additions & 0 deletions post_processing/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import typing
import numpy as np
import pandas as pd
Expand All @@ -14,6 +15,9 @@
INDEX_96_F = pd.Index(INDEX_96_C.values.reshape(len(ROWS_96), -1).ravel(order="F"))
INDEX_384_C = pd.Index([f"{r}{c}" for r,c in product(ROWS_384, COLS_384)])
INDEX_384_F = pd.Index(INDEX_384_C.values.reshape(len(ROWS_384), -1).ravel(order="F"))
EXCEL_COLS = list(ascii_uppercase) + ["".join(x) for x in product(ascii_uppercase, repeat=2)]
EXCEL_TO_NUM = dict((letters, num) for num, letters in enumerate(EXCEL_COLS))


assert len(INDEX_96_C) == len(INDEX_96_F) == 96
assert len(INDEX_384_C) == len(INDEX_384_F) == 384
Expand All @@ -27,6 +31,27 @@ def sort_index(series):
return series.reindex(sorted(series.index, key=well_sort))


def flatten(*items):
    """Recursively flatten arbitrarily nested tuples/lists into a flat list."""
    flat = []
    for item in items:
        if isinstance(item, (tuple, list)):
            flat.extend(flatten(*item))
        else:
            flat.append(item)
    return flat


def parse_column_spec(col_spec):
    """Parse a user-supplied column specification.

    Accepts an int (returned as-is), a digit string (``"3"`` -> 3), an
    Excel-style column letter (``"A"`` -> 0, ``"AA"`` -> 26), a ``:``
    range (-> ``slice``), a ``,``-separated list (-> flat list), or any
    other string (returned unchanged, treated as a column label).
    """
    if isinstance(col_spec, int):
        return col_spec
    elif col_spec.isdigit():
        # str.isdigit() is False for "" — the previous
        # all(c.isdigit() ...) was vacuously True and crashed int("").
        return int(col_spec)
    elif col_spec in EXCEL_TO_NUM:
        # EXCEL_TO_NUM is built with enumerate and is already 0-based
        # ("A" -> 0); the previous "- 1" mapped "A" to -1, which made
        # the iloc fallback in callers select the *last* column.
        return EXCEL_TO_NUM[col_spec]

    assert isinstance(col_spec, str)
    match = re.match(r"(.*)([:,])(.*)", col_spec)
    if not match:
        # No separator: treat the spec as a literal column label.
        return col_spec
    left, sep, right = match.groups()
    if sep == ":":
        return slice(parse_column_spec(left), parse_column_spec(right))
    # Comma list: recurse on both halves and flatten nested results.
    return flatten([parse_column_spec(left), parse_column_spec(right)])


def flatten_plate_map(data, colwise=False):
ravel = "F" if colwise else "C"
if isinstance(data, pd.Series) or isinstance(data, pd.Index):
Expand All @@ -51,6 +76,7 @@ def construct_384(data_384, name, colwise=False):
index = INDEX_384_F if colwise else INDEX_384_C
return pd.Series(data_384, name=name, index=index)


def index_by_quad(quad, colwise=False):
assert quad < 4
ravel = "F" if colwise else "C"
Expand Down Expand Up @@ -102,3 +128,12 @@ def assert_path_does_not_exist(name: str, path: Path) -> None:
if path is not None and path.exists():
logger.error(f"{name.replace('_',' ').capitalize()}: [{path}] already exist!")
exit(2)


if __name__ == "__main__":
    # Ad-hoc smoke checks for parse_column_spec covering each spec form
    # (int, digit string, literal label, comma list, range, Excel letters).
    # Results are printed for manual inspection, not asserted.
    print(parse_column_spec(1))
    print(parse_column_spec("1:4"))
    print(parse_column_spec("This Column"))
    print(parse_column_spec("This Column,That Column,The other Column"))
    print(parse_column_spec("1,4"))
    print(parse_column_spec("DC,GH"))

0 comments on commit a9e263e

Please sign in to comment.