Skip to content

Commit

Permalink
Gazelle updates to BUILD files, concat_files
Browse files Browse the repository at this point in the history
  • Loading branch information
IanAWatson committed Nov 27, 2024
1 parent fdf975d commit 6527a61
Show file tree
Hide file tree
Showing 11 changed files with 147 additions and 78 deletions.
7 changes: 4 additions & 3 deletions docs/Workflows/train_test_split_optimise.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

Based on the very nice post by Pat Walters
[Practical Cheminformatics](https://practicalcheminformatics.blogspot.com/2024/11/some-thoughts-on-splitting-chemical.html)
I decided to apply train_test_split_optimise to the Biogen ADME datasets
I decided to apply train_test_split_optimise to the Biogen ADME datasets
he uses.

Pat covers various splitting strategies that I will not recreate here. Instead
Expand All @@ -20,7 +20,8 @@ I will compare the Biogen features with a smaller set of 285 features
from LillyMol [iwdescr](/docs/Molecule_Tools/iwdescr.md) and some Abraham features.

Build XGBoost models on the Biogen training sets for both the Biogen features and
the LillyMol features. Measure R2, so higher numbers are better. All models are built
the LillyMol features. Score the Biogen training sets and measure R2, so higher numbers
are better. All models are built
in about 10 seconds. No hyperparameter tuning was attempted, use the defaults built
into LillyMol script [xgbd_make](/contrib/bin/xgbd/xgbd_make.py).

Expand All @@ -36,7 +37,7 @@ into LillyMol script [xgbd_make](/contrib/bin/xgbd/xgbd_make.py).
Generally the 285 LillyMol features perform better - although this is not universally
true, and when the Biogen features are better, the difference is small. Use the LillyMol
features going forward, especially since there are many fewer of them than what came
from Biogen.
from Biogen. Also the LillyMol features do not have any NaN values.

Using RMS as a measure of performance shows similar results, smaller numbers are better
| DataSet | Biogen | LillyMol |
Expand Down
2 changes: 1 addition & 1 deletion src/Foundational/data_source/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,12 @@ cc_library(
hdrs = [
"iwrecordio.h",
],
visibility = ["//visibility:public"],
deps = [
"//Foundational/iwmisc",
"//Foundational/iwstring",
"@com_google_protobuf//:protobuf",
],
visibility = ["//visibility:public"],
)

proto_library(
Expand Down
1 change: 0 additions & 1 deletion src/Foundational/iwmisc/BUILD
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
load("@rules_go//proto:def.bzl", "go_proto_library")
load("@rules_proto//proto:defs.bzl", "proto_library")

proto_library(
Expand Down
2 changes: 2 additions & 0 deletions src/MODULE.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ bazel_dep(name = "abseil-cpp", version = "20240116.2", repo_name="com_google_abs
bazel_dep(name = "bazel_skylib", version = "1.7.1")
bazel_dep(name = "caseyduquettesc_rules_python_pytest", version = "1.1.0", repo_name = "rules_python_pytest")
bazel_dep(name = "eigen", version = "3.4.0", repo_name="com_gitlab_libeigen_eigen")
# bazel_dep(name = "gazelle", version = "0.38.0")
bazel_dep(name = "highwayhash", version = "0.0.0-20240305-5ad3bf8")

# Does not work inside Lilly, link error half way through the build.
Expand All @@ -36,6 +37,7 @@ bazel_dep(name = "pybind11_bazel", version = "2.13.6")
bazel_dep(name = "re2", version = "2024-07-02")
bazel_dep(name = "rules_cc", version = "0.0.9")
bazel_dep(name = "rules_go", version = "0.49.0")
# bazel_dep(name = "rules_proto_grpc_go", version = "5.0.1")
bazel_dep(name = "rules_pkg", version = "0.10.1")
bazel_dep(name = "rules_proto", version = "6.0.2")
bazel_dep(name = "rules_python", version = "0.39.0")
Expand Down
28 changes: 14 additions & 14 deletions src/Utilities/GeneExpression/BUILD
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
load("@rules_proto//proto:defs.bzl", "proto_library")
load("@com_google_protobuf//:protobuf.bzl", "py_proto_library")
load("@rules_proto//proto:defs.bzl", "proto_library")
load("//build_deps:install.bzl", "local_install")

local_install(
name = "install",
srcs = [
":gene_expression_to_proto",
":gene_expression_nearneighbours",
":gene_expression_to_proto",
],
)

Expand All @@ -15,23 +15,23 @@ proto_library(
srcs = [
"gene_expression.proto",
],
visibility = ["//visibility:public"],
)

cc_proto_library(
name = "gene_expression_cc_proto",
deps = [
":gene_expression_proto",
]
],
)

py_proto_library(
name = "gene_expression_py_proto",
srcs = [
"gene_expression.proto",
]
],
)


cc_library(
name = "gene_expression_lib",
srcs = [
Expand All @@ -42,7 +42,7 @@ cc_library(
],
deps = [
":gene_expression_cc_proto",
]
],
)

cc_library(
Expand All @@ -54,12 +54,12 @@ cc_library(
"needle.h",
],
deps = [
"//Foundational/accumulator",
"//Foundational/iwstring:iwstring",
":gene_expression_cc_proto",
"//Foundational/accumulator",
"//Foundational/iwstring",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_protobuf//:protobuf",
]
],
)

#cc_binary(
Expand All @@ -79,6 +79,9 @@ cc_binary(
srcs = [
"gene_expression_nearneighbours.cc",
],
tags = [
"gene_expression",
],
deps = [
":gene_expression_cc_proto",
":gene_expression_lib",
Expand All @@ -88,9 +91,6 @@ cc_binary(
"//Foundational/data_source:iwtfdata_record",
"@com_google_protobuf//:protobuf",
],
tags = [
"gene_expression",
],
)

cc_binary(
Expand All @@ -105,10 +105,10 @@ cc_binary(
"//Foundational/data_source:iwstring_data_source",
"//Foundational/data_source:iwtfdata_record",
"@com_google_protobuf//:protobuf",
]
],
)

cc_test (
cc_test(
name = "needle_test",
srcs = [
"needle_test.cc",
Expand Down
63 changes: 34 additions & 29 deletions src/Utilities/General/concat_files.cc
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ preprocess_to_identifier(const_IWSubstring& buffer, const int identifier_column,
do_trim_leading_zeros_from_identifiers(buffer);
}

if (0 == buffer.length()) {
if (buffer.empty()) {
cerr << "preprocess_to_identifier:zero length identifier, column "
<< identifier_column << " sep '" << sep << "'\n";
return 0;
Expand Down Expand Up @@ -331,7 +331,7 @@ recognise_file_qualifiers(const char* s, char& sep, int& col, int& quot,
while (buffer.nextword(token, i, ',')) {
// cerr << "recognise_file_qualifiers:examining '" << token << "'\n";

if (0 == fname.length()) {
if (fname.empty()) {
fname = token;
} else if (token.starts_with("sep=")) {
token.remove_leading_chars(4);
Expand All @@ -358,8 +358,7 @@ recognise_file_qualifiers(const char* s, char& sep, int& col, int& quot,
}
}

if (0 == fname.length()) // should never happen
{
if (fname.empty()) { // should never happen
cerr << "recognise_file_qualifiers:file name not specified\n";
return 0;
}
Expand Down Expand Up @@ -434,7 +433,7 @@ AFile::initialise(const char* fname) {
buffer.strip_leading_blanks();
buffer.strip_trailing_blanks();

if (0 == buffer.length()) {
if (buffer.empty()) {
cerr << "Blank line at line " << iwstring_data_source::lines_read() << " in '"
<< fname << "'\n";

Expand Down Expand Up @@ -830,29 +829,35 @@ usage(int rc) {
#endif
// clang-format on
// clang-format off
cerr << "Concatenates descriptor files by joining on identifiers\n";
cerr << " set per file settings with 'fname,sep=comma,col=4'\n";
cerr << " -u truncate identifiers at first '_' char\n";
cerr << " -a write all identifiers (includes those not in first file)\n";
cerr << " -M <missing> missing value string (default " << missing_value << ")\n";
cerr << " -d skip duplicate identifiers in the first file\n";
cerr << " -f first file may contain duplicate ID's\n";
cerr << " -g ignore duplicate identifiers in files\n";
cerr << " -c <column> identifier column(s) (default 1)\n";
cerr << " -z trim leading zero's from identifiers\n";
cerr << " -I only write records for which identifier is present in every file\n";
cerr << " -K <fname> write identifiers discarded by -I option to <fname>\n";
cerr << " -n input files are NOT descriptor files - header records not special\n";
cerr << " -k skip blank lines in all files\n";
cerr << " -s ignore case when comparing identifiers\n";
cerr << " -D die stop processing if duplicate descriptor names are encountered\n";
cerr << " -D rm remove duplicate descriptors\n";
cerr << " -D disambiguate assign new unique names to duplicate descriptors\n";
cerr << " -i <sep> input file separator (default space)\n";
cerr << " -o <sep> output file separator (default space)\n";
cerr << " -q input consists of quoted fields\n";
cerr << " -Y ... other options, enter '-Y help' for info\n";
cerr << " -v verbose output\n";
cerr << R"(
Concatenates/joins descriptor files by joining on identifiers
By default, identifiers are assumed to be in column 1 of each file.
File names can be annotated with per-file specifications of the input separator
and the identifier column. For example
concat_files file1,sep=comma,col=2 file2,sep=space,col=1 > combined.txt
-u truncate identifiers at first '_' char
-a write all identifiers (includes those not in first file)
-M <missing> missing value string (default " << missing_value << ")
-d skip duplicate identifiers in the first file
-f first file may contain duplicate ID's
-g ignore duplicate identifiers in files
-c <column> identifier column(s) (default 1)
-z trim leading zero's from identifiers
-I only write records for which identifier is present in every file
-K <fname> write identifiers discarded by -I option to <fname>
-n input files are NOT descriptor files - header records not special
-k skip blank lines in all files
-s ignore case when comparing identifiers
-D die stop processing if duplicate descriptor names are encountered
-D rm remove duplicate descriptors
-D disambiguate assign new unique names to duplicate descriptors
-i <sep> input file separator (default space)
-o <sep> output file separator (default space)
-q input consists of quoted fields
-Y ... other options, enter '-Y help' for info
-v verbose output
)";
// clang-format on

exit(rc);
Expand Down Expand Up @@ -1104,7 +1109,7 @@ concat_files(iwstring_data_source& input, const char input_separator,
// buffer.strip_trailing_blanks(); no, breaks with tab separated input
// buffer.strip_leading_blanks();

if (0 == buffer.length()) {
if (buffer.empty()) {
if (skip_blank_lines) {
continue;
}
Expand Down
47 changes: 27 additions & 20 deletions src/go/BUILD
Original file line number Diff line number Diff line change
@@ -1,56 +1,63 @@
load("@rules_go//go:def.bzl", "go_binary", "go_test")
load("@gazelle//:def.bzl", "gazelle")
load("@rules_go//go:def.bzl", "go_binary", "go_library")
load("//build_deps:install.bzl", "local_install")

gazelle(name = "gazelle")

local_install(
name = "install",
srcs = [
":grep_sdf",
":parallel_process_file",
":no_spaces_in_file_name",
":parallel_process_file",
":regression_to_classification",
":rxn_reverse",
":rxnsmiles2smi",
],
)

# gazelle:prefix xgboost

go_binary(
name = "parallel_process_file",
srcs = [
"parallel_process_file.go",
],
srcs = ["parallel_process_file.go"],
)

go_binary(
name = "no_spaces_in_file_name",
srcs = [
"no_spaces_in_file_name.go",
],
srcs = ["no_spaces_in_file_name.go"],
)

go_binary(
name = "regression_to_classification",
srcs = [
"regression_to_classification.go",
],
srcs = ["regression_to_classification.go"],
)

go_binary(
name = "rxn_reverse",
srcs = [
"rxn_reverse.go",
],
srcs = ["rxn_reverse.go"],
)

go_binary(
name = "rxnsmiles2smi",
srcs = [
"rxnsmiles2smi.go",
],
srcs = ["rxnsmiles2smi.go"],
)

go_binary(
name = "grep_sdf",
srcs = [
"grep_sdf.go",
],
srcs = ["grep_sdf.go"],
)

# Could never get this to work.
# Problem came down to trying to make the import
# google.golang.org/protobuf/proto
# available, and I was never able to figure that out.

# go_binary(
# name = "xgbd_evaluate",
# srcs = ["xgbd_evaluate.go"],
# deps = [
# "//xgboost:xgboost_model_go_proto",
# we need something here for the missing dependency.
# ],
# )
Loading

0 comments on commit 6527a61

Please sign in to comment.