Skip to content

Commit

Permalink
Gazelle updates to BUILD files, concat_files
Browse files Browse the repository at this point in the history
  • Loading branch information
IanAWatson committed Nov 27, 2024
1 parent fdf975d commit 6527a61
Show file tree
Hide file tree
Showing 11 changed files with 147 additions and 78 deletions.
7 changes: 4 additions & 3 deletions docs/Workflows/train_test_split_optimise.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

Based on the very nice post by Pat Walters
[Practical Cheminformatics](https://practicalcheminformatics.blogspot.com/2024/11/some-thoughts-on-splitting-chemical.html)
I decided to apply train_test_split_optimise to the Biogen ADME datasets
I decided to apply train_test_split_optimise to the Biogen ADME datasets
he uses.

Pat covers various splitting strategies that I will not recreate here. Instead
Expand All @@ -20,7 +20,8 @@ I will compare the Biogen features with a smaller set of 285 features
from LillyMol [iwdescr](/docs/Molecule_Tools/iwdescr.md) and some Abraham features.

Build XGBoost models on the Biogen training sets for both the Biogen features and
the LillyMol features. Measure R2, so higher numbers are better. All models are built
the LillyMol features. Score the Biogen training sets and measure R2, so higher numbers
are better. All models are built
in about 10 seconds. No hyperparameter tuning was attempted, use the defaults built
into LillyMol script [xgbd_make](/contrib/bin/xgbd/xgbd_make.py).

Expand All @@ -36,7 +37,7 @@ into LillyMol script [xgbd_make](/contrib/bin/xgbd/xgbd_make.py).
Generally the 285 LillyMol features perform better - although this is not universally
true, and when the Biogen features are better, the difference is small. Use the LillyMol
features going forward, especially since there are many fewer of them than what came
from Biogen.
from Biogen. Also the LillyMol features do not have any NaN values.

Using RMS as a measure of performance shows similar results, smaller numbers are better
| DataSet | Biogen | LillyMol |
Expand Down
2 changes: 1 addition & 1 deletion src/Foundational/data_source/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,12 @@ cc_library(
hdrs = [
"iwrecordio.h",
],
visibility = ["//visibility:public"],
deps = [
"//Foundational/iwmisc",
"//Foundational/iwstring",
"@com_google_protobuf//:protobuf",
],
visibility = ["//visibility:public"],
)

proto_library(
Expand Down
1 change: 0 additions & 1 deletion src/Foundational/iwmisc/BUILD
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
load("@rules_go//proto:def.bzl", "go_proto_library")
load("@rules_proto//proto:defs.bzl", "proto_library")

proto_library(
Expand Down
2 changes: 2 additions & 0 deletions src/MODULE.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ bazel_dep(name = "abseil-cpp", version = "20240116.2", repo_name="com_google_abs
bazel_dep(name = "bazel_skylib", version = "1.7.1")
bazel_dep(name = "caseyduquettesc_rules_python_pytest", version = "1.1.0", repo_name = "rules_python_pytest")
bazel_dep(name = "eigen", version = "3.4.0", repo_name="com_gitlab_libeigen_eigen")
# bazel_dep(name = "gazelle", version = "0.38.0")
bazel_dep(name = "highwayhash", version = "0.0.0-20240305-5ad3bf8")

# Does not work inside Lilly, link error half way through the build.
Expand All @@ -36,6 +37,7 @@ bazel_dep(name = "pybind11_bazel", version = "2.13.6")
bazel_dep(name = "re2", version = "2024-07-02")
bazel_dep(name = "rules_cc", version = "0.0.9")
bazel_dep(name = "rules_go", version = "0.49.0")
# bazel_dep(name = "rules_proto_grpc_go", version = "5.0.1")
bazel_dep(name = "rules_pkg", version = "0.10.1")
bazel_dep(name = "rules_proto", version = "6.0.2")
bazel_dep(name = "rules_python", version = "0.39.0")
Expand Down
28 changes: 14 additions & 14 deletions src/Utilities/GeneExpression/BUILD
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
load("@rules_proto//proto:defs.bzl", "proto_library")
load("@com_google_protobuf//:protobuf.bzl", "py_proto_library")
load("@rules_proto//proto:defs.bzl", "proto_library")
load("//build_deps:install.bzl", "local_install")

local_install(
name = "install",
srcs = [
":gene_expression_to_proto",
":gene_expression_nearneighbours",
":gene_expression_to_proto",
],
)

Expand All @@ -15,23 +15,23 @@ proto_library(
srcs = [
"gene_expression.proto",
],
visibility = ["//visibility:public"],
)

cc_proto_library(
name = "gene_expression_cc_proto",
deps = [
":gene_expression_proto",
]
],
)

py_proto_library(
name = "gene_expression_py_proto",
srcs = [
"gene_expression.proto",
]
],
)


cc_library(
name = "gene_expression_lib",
srcs = [
Expand All @@ -42,7 +42,7 @@ cc_library(
],
deps = [
":gene_expression_cc_proto",
]
],
)

cc_library(
Expand All @@ -54,12 +54,12 @@ cc_library(
"needle.h",
],
deps = [
"//Foundational/accumulator",
"//Foundational/iwstring:iwstring",
":gene_expression_cc_proto",
"//Foundational/accumulator",
"//Foundational/iwstring",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_protobuf//:protobuf",
]
],
)

#cc_binary(
Expand All @@ -79,6 +79,9 @@ cc_binary(
srcs = [
"gene_expression_nearneighbours.cc",
],
tags = [
"gene_expression",
],
deps = [
":gene_expression_cc_proto",
":gene_expression_lib",
Expand All @@ -88,9 +91,6 @@ cc_binary(
"//Foundational/data_source:iwtfdata_record",
"@com_google_protobuf//:protobuf",
],
tags = [
"gene_expression",
],
)

cc_binary(
Expand All @@ -105,10 +105,10 @@ cc_binary(
"//Foundational/data_source:iwstring_data_source",
"//Foundational/data_source:iwtfdata_record",
"@com_google_protobuf//:protobuf",
]
],
)

cc_test (
cc_test(
name = "needle_test",
srcs = [
"needle_test.cc",
Expand Down
63 changes: 34 additions & 29 deletions src/Utilities/General/concat_files.cc
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ preprocess_to_identifier(const_IWSubstring& buffer, const int identifier_column,
do_trim_leading_zeros_from_identifiers(buffer);
}

if (0 == buffer.length()) {
if (buffer.empty()) {
cerr << "preprocess_to_identifier:zero length identifier, column "
<< identifier_column << " sep '" << sep << "'\n";
return 0;
Expand Down Expand Up @@ -331,7 +331,7 @@ recognise_file_qualifiers(const char* s, char& sep, int& col, int& quot,
while (buffer.nextword(token, i, ',')) {
// cerr << "recognise_file_qualifiers:examining '" << token << "'\n";

if (0 == fname.length()) {
if (fname.empty()) {
fname = token;
} else if (token.starts_with("sep=")) {
token.remove_leading_chars(4);
Expand All @@ -358,8 +358,7 @@ recognise_file_qualifiers(const char* s, char& sep, int& col, int& quot,
}
}

if (0 == fname.length()) // should never happen
{
if (fname.empty()) { // should never happen
cerr << "recognise_file_qualifiers:file name not specified\n";
return 0;
}
Expand Down Expand Up @@ -434,7 +433,7 @@ AFile::initialise(const char* fname) {
buffer.strip_leading_blanks();
buffer.strip_trailing_blanks();

if (0 == buffer.length()) {
if (buffer.empty()) {
cerr << "Blank line at line " << iwstring_data_source::lines_read() << " in '"
<< fname << "'\n";

Expand Down Expand Up @@ -830,29 +829,35 @@ usage(int rc) {
#endif
// clang-format on
// clang-format off
cerr << "Concatenates descriptor files by joining on identifiers\n";
cerr << " set per file settings with 'fname,sep=comma,col=4'\n";
cerr << " -u truncate identifiers at first '_' char\n";
cerr << " -a write all identifiers (includes those not in first file)\n";
cerr << " -M <missing> missing value string (default " << missing_value << ")\n";
cerr << " -d skip duplicate identifiers in the first file\n";
cerr << " -f first file may contain duplicate ID's\n";
cerr << " -g ignore duplicate identifiers in files\n";
cerr << " -c <column> identifier column(s) (default 1)\n";
cerr << " -z trim leading zero's from identifiers\n";
cerr << " -I only write records for which identifier is present in every file\n";
cerr << " -K <fname> write identifiers discarded by -I option to <fname>\n";
cerr << " -n input files are NOT descriptor files - header records not special\n";
cerr << " -k skip blank lines in all files\n";
cerr << " -s ignore case when comparing identifiers\n";
cerr << " -D die stop processing if duplicate descriptor names are encountered\n";
cerr << " -D rm remove duplicate descriptors\n";
cerr << " -D disambiguate assign new unique names to duplicate descriptors\n";
cerr << " -i <sep> input file separator (default space)\n";
cerr << " -o <sep> output file separator (default space)\n";
cerr << " -q input consists of quoted fields\n";
cerr << " -Y ... other options, enter '-Y help' for info\n";
cerr << " -v verbose output\n";
cerr << R"(
Concatenates/joins descriptor files by joining on identifiers
By default, identifiers are assumed to be in column 1 of each file.
File names can be annotated with per-file specifications of the input separator
and the identifier column. For example
concat_files file1,sep=comma,col=2 file2,sep=space,col=1 > combined.txt
-u truncate identifiers at first '_' char
-a write all identifiers (includes those not in first file)
-M <missing> missing value string (default " << missing_value << ")
-d skip duplicate identifiers in the first file
-f first file may contain duplicate ID's
-g ignore duplicate identifiers in files
-c <column> identifier column(s) (default 1)
-z trim leading zero's from identifiers
-I only write records for which identifier is present in every file
-K <fname> write identifiers discarded by -I option to <fname>
-n input files are NOT descriptor files - header records not special
-k skip blank lines in all files
-s ignore case when comparing identifiers
-D die stop processing if duplicate descriptor names are encountered
-D rm remove duplicate descriptors
-D disambiguate assign new unique names to duplicate descriptors
-i <sep> input file separator (default space)
-o <sep> output file separator (default space)
-q input consists of quoted fields
-Y ... other options, enter '-Y help' for info
-v verbose output
)";
// clang-format on

exit(rc);
Expand Down Expand Up @@ -1104,7 +1109,7 @@ concat_files(iwstring_data_source& input, const char input_separator,
// buffer.strip_trailing_blanks(); no, breaks with tab separated input
// buffer.strip_leading_blanks();

if (0 == buffer.length()) {
if (buffer.empty()) {
if (skip_blank_lines) {
continue;
}
Expand Down
47 changes: 27 additions & 20 deletions src/go/BUILD
Original file line number Diff line number Diff line change
@@ -1,56 +1,63 @@
load("@rules_go//go:def.bzl", "go_binary", "go_test")
load("@gazelle//:def.bzl", "gazelle")
load("@rules_go//go:def.bzl", "go_binary", "go_library")
load("//build_deps:install.bzl", "local_install")

gazelle(name = "gazelle")

local_install(
name = "install",
srcs = [
":grep_sdf",
":parallel_process_file",
":no_spaces_in_file_name",
":parallel_process_file",
":regression_to_classification",
":rxn_reverse",
":rxnsmiles2smi",
],
)

# gazelle:prefix xgboost

go_binary(
name = "parallel_process_file",
srcs = [
"parallel_process_file.go",
],
srcs = ["parallel_process_file.go"],
)

go_binary(
name = "no_spaces_in_file_name",
srcs = [
"no_spaces_in_file_name.go",
],
srcs = ["no_spaces_in_file_name.go"],
)

go_binary(
name = "regression_to_classification",
srcs = [
"regression_to_classification.go",
],
srcs = ["regression_to_classification.go"],
)

go_binary(
name = "rxn_reverse",
srcs = [
"rxn_reverse.go",
],
srcs = ["rxn_reverse.go"],
)

go_binary(
name = "rxnsmiles2smi",
srcs = [
"rxnsmiles2smi.go",
],
srcs = ["rxnsmiles2smi.go"],
)

go_binary(
name = "grep_sdf",
srcs = [
"grep_sdf.go",
],
srcs = ["grep_sdf.go"],
)

# Could never get this to work.
# Problem came down to trying to make the import
# google.golang.org/protobuf/proto
# available, and I was never able to figure that out.

# go_binary(
# name = "xgbd_evaluate",
# srcs = ["xgbd_evaluate.go"],
# deps = [
# "//xgboost:xgboost_model_go_proto",
# we need something here for the missing dependency.
# ],
# )
Loading

0 comments on commit 6527a61

Please sign in to comment.