Skip to content

Commit

Permalink
calibrate and svmfp related
Browse files Browse the repository at this point in the history
  • Loading branch information
IanAWatson committed Dec 7, 2024
1 parent 92937c2 commit e130f81
Show file tree
Hide file tree
Showing 10 changed files with 1,107 additions and 36 deletions.
94 changes: 94 additions & 0 deletions contrib/bin/calibrate_xgbd_client.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env ruby

require 'fileutils'

# Client script for model_calibrate. Builds and scores xgboost models
# using xgbd_make.sh and xgboost_model_evaluate.sh.

require_relative 'lib/iwcmdline'

# Writes a summary of the recognised command line options to stderr and
# then terminates the script.
#
# @param rc [Integer] exit status handed to Kernel#exit
def usage(rc)
  $stderr << "Client script for model_calibrate: builds and scores an xgboost descriptor model\n"
  $stderr << " -DESC <fname>        descriptor file (required)\n"
  $stderr << " -TRactivity <fname>  training set activity file (required)\n"
  $stderr << " -TEactivity <fname>  testing set activity file (required)\n"
  $stderr << " -PRED <fname>        predicted values file (required)\n"
  $stderr << " -STATS <fname>       statistics file (required)\n"
  $stderr << " -TMPDIR <dir>        temporary directory (one of -TMPDIR or -uid is required)\n"
  $stderr << " -uid <string>        unique identifier, temporary files go in /tmp/calibrate_<uid>\n"
  $stderr << " -v                   verbose output\n"
  exit(rc)
end

# Optionally echoes a shell command, runs it, and aborts the whole script
# with exit status 1 if the command fails or cannot be started.
def run_or_exit(cmd, verbose)
  $stderr << "Executing #{cmd}\n" if verbose
  return if system(cmd)

  $stderr << "#{cmd} failed\n"
  exit(1)
end

# Builds an xgboost descriptor model from the training subset of the
# descriptor file, scores the test subset, and writes summary statistics.
#
# All of -DESC, -TRactivity, -TEactivity, -PRED and -STATS are required,
# as is one of -TMPDIR or -uid. Intermediate files (train.dat, test.dat and
# the MODEL directory) live in the temporary directory, which is removed
# when the function finishes, whether or not every step succeeded.
def main
  cl = IWCmdline.new("-v-DESC=sfile-TRactivity=sfile-TEactivity=sfile-PRED=s-STATS=s-TMPDIR=s-uid=s")

  required = {
    'DESC' => 'descriptor file',
    'TRactivity' => 'training set activity file',
    'TEactivity' => 'testing set activity file',
    'PRED' => 'predicted values file',
    'STATS' => 'statistics file'
  }
  required.each do |opt, what|
    next if cl.option_present(opt)

    $stderr << "Must specify #{what} via the -#{opt} option\n"
    usage(1)
  end

  tmpdir = if cl.option_present('TMPDIR')
             cl.value('TMPDIR')
           elsif cl.option_present('uid')
             "/tmp/calibrate_#{cl.value('uid')}"
           else
             $stderr << "Must specify either -TMPDIR or unique identifier via the -uid option\n"
             usage(1)
           end

  if cl.unrecognised_options_encountered
    $stderr << "Unrecognised options encountered\n"
    usage(1)
  end

  verbose = cl.option_present('v')

  descriptors = cl.value('DESC')
  train_activity = cl.value('TRactivity')
  test_activity = cl.value('TEactivity')
  predicted = cl.value('PRED')
  results = cl.value('STATS')

  tmptrain = File.join(tmpdir, "train.dat")
  tmptest = File.join(tmpdir, "test.dat")
  mdir = File.join(tmpdir, 'MODEL')

  # Create the directory only once the command line is known to be valid,
  # so a usage error leaves nothing behind.
  FileUtils.mkdir_p(tmpdir)

  begin
    # Each step consumes the output of an earlier one, so stop at the
    # first failure rather than scoring with missing or partial files.
    run_or_exit("descriptor_file_select_rows #{train_activity} #{descriptors} > #{tmptrain}", verbose)
    run_or_exit("descriptor_file_select_rows #{test_activity} #{descriptors} > #{tmptest}", verbose)
    run_or_exit("xgbd_make.sh --mdir #{mdir} --activity #{train_activity} #{tmptrain}", verbose)
    run_or_exit("xgboost_model_evaluate.sh -mdir #{mdir} #{tmptest} > #{predicted}", verbose)
    run_or_exit("iwstats -w -Y allequals -E #{test_activity} -p 2 #{predicted} > #{results}", verbose)
  ensure
    # Removing the directory also removes train.dat and test.dat inside it;
    # unlinking them separately afterwards would raise Errno::ENOENT.
    # NOTE(review): this deletes a directory given via -TMPDIR even if it
    # existed beforehand - same as the original behaviour.
    FileUtils.rm_rf(tmpdir)
  end
end

# Script entry point.
main
173 changes: 173 additions & 0 deletions contrib/bin/model_calibrate.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
#!/usr/bin/env ruby
#
require_relative 'lib/iwcmdline'

# Writes a summary of the recognised command line options to stderr and
# then terminates the script.
#
# @param rc [Integer] exit status handed to Kernel#exit
def usage(rc)
  $stderr << "Generates train/test splits and writes a file of model calibration commands\n"
  $stderr << "usage: model_calibrate [options] <smiles file>\n"
  $stderr << " -A <fname>      activity file (required)\n"
  $stderr << " -fp <fname>     file(s) containing fingerprints, one per line (required)\n"
  $stderr << " -DESC <glob>    descriptor file(s), glob patterns are expanded\n"
  $stderr << " -niter <n>      number of splits to generate (default 10)\n"
  $stderr << " -TRpct <pct>    percent passed to stratified_samples -p (default 80)\n"
  $stderr << " -PRED <stem>    file name stem for predicted values (default PRED)\n"
  $stderr << " -v              verbose output\n"
  exit(rc)
end

# Holds the four file names that make up one train/test split: the smiles
# and activity files for the training set and for the test set.
class TTSplit
  attr_accessor :train_smi
  attr_accessor :train_activity
  attr_accessor :test_smi
  attr_accessor :test_activity

  # Arguments, in order: training smiles, training activity, test smiles,
  # test activity.
  def initialize(trs, tra, tes, tea)
    @train_smi, @train_activity = trs, tra
    @test_smi, @test_activity = tes, tea
  end
end

# Reads every line of every file in `fnames` and returns them, in order and
# with the line terminator removed, as a single flat Array of Strings.
def read_fingerprints(fnames)
  fnames.flat_map do |path|
    File.readlines(path).map(&:chomp)
  end
end

# Collects the files belonging to splits 0, 1, ... up to `niter` - 1.
# Split `i` consists of <train_stem>i.smi, <train_stem>i.activity,
# <test_stem>i.smi and <test_stem>i.activity. Scanning stops at the first
# split for which any of those four files is missing; later splits are not
# examined. Returns an Array of TTSplit objects for the splits found.
def gather_split_files(train_stem, test_stem, niter)
  candidates = (0...niter).lazy.map do |ndx|
    [
      "#{train_stem}#{ndx}.smi",
      "#{train_stem}#{ndx}.activity",
      "#{test_stem}#{ndx}.smi",
      "#{test_stem}#{ndx}.activity"
    ]
  end

  candidates
    .take_while { |paths| paths.all? { |path| File.file?(path) } }
    .map { |paths| TTSplit.new(*paths) }
    .to_a
end

# Runs stratified_samples to generate `niter` train/test splits of
# `smiles` and `activity`, writing files with the stems TRAIN and TEST,
# then collects whichever split files are present.
# Returns an Array of TTSplit, or an empty Array if the command fails.
def make_splits(smiles, activity, niter, trpct)
  cmd = "stratified_samples -v -E TEST -R TRAIN" \
        " -N #{niter} -p #{trpct} -s 1 -S .activity" \
        " -M #{smiles} #{activity}"

  return gather_split_files('TRAIN', 'TEST', niter) if system(cmd)

  $stderr << "#{cmd} failed\n"
  []
end

# Generates train/test splits for the smiles file given as the single
# positional argument, then writes model_calibrate.txt: one command per
# fingerprint per split and one command per descriptor file per split.
#
# Returns 0 on success and 1 if split generation fails or a -DESC glob
# matches nothing. Command line errors terminate the script via usage().
def main
  # NOTE(review): the spec contains "--fp" (double dash) while the code below
  # queries 'fp', and -S / -appendfp are declared but never read here.
  # IWCmdline's parsing is not visible from this file - confirm intended.
  cl = IWCmdline.new("-v-A=sfile-S=s-niter=ipos-TRpct=ipos--fp=sfile-appendfp=sfile-DESC=s-PRED=s")

  if cl.unrecognised_options_encountered
    $stderr << "Unrecognised options encountered\n"
    usage(1)
  end

  verbose = cl.option_present('v')

  unless cl.option_present('A')
    $stderr << "Must specify name of activity file via the -A option\n"
    usage(1)
  end

  activity_fname = cl.value('A')

  unless cl.option_present('fp')
    $stderr << "Must specify one or more fingerprint files via the -fp option\n"
    usage(1)
  end

  fingerprints = read_fingerprints(cl.values('fp'))

  $stderr << "Read #{fingerprints.size} fingerprints\n"

  if ARGV.empty?
    $stderr << "No smiles specified\n"
    usage(1)
  end

  if ARGV.size > 1
    $stderr << "Takes just a single argument - the smiles file\n"
    usage(1)
  end

  smiles = ARGV[0]

  niter = cl.option_present('niter') ? cl.value('niter') : 10
  $stderr << "Will generate #{niter} splits\n" if verbose

  trpct = cl.option_present('TRpct') ? cl.value('TRpct') : 80
  stats_stem = "A#{trpct}"
  predicted_stem = cl.option_present('PRED') ? cl.value('PRED') : 'PRED'

  splits = make_splits(smiles, activity_fname, niter, trpct)
  if splits.empty?
    # Without splits every command loop would be empty; stop instead of
    # writing a command file that does nothing.
    $stderr << "Split generation failed\n"
    return 1
  end

  # Each -DESC value is treated as a glob pattern; a pattern that matches
  # nothing is an error.
  descriptor_files = []
  cl.values('DESC').each do |desc|
    matches = Dir.glob(desc)
    if matches.empty?
      $stderr << "Descriptor file glob #{desc} no matches\n"
      return 1
    end
    descriptor_files.concat(matches)
  end

  if verbose
    $stderr << "Processing #{descriptor_files.size} descriptor files\n"
    $stderr << descriptor_files.join(' ') << "\n"
  end

  command_file = "model_calibrate.txt"
  write_command_file(splits, descriptor_files, fingerprints, predicted_stem, stats_stem, command_file)

  0
end

# Writes the calibration commands to `command_file`, one per line.
#
# For every fingerprint and every split a calibrate_svmfp_client.sh command
# is written; for every descriptor file and every split a
# calibrate_xgbd_client.sh command is written. Progress is reported on
# stderr.
#
# splits           - objects responding to train_smi, train_activity,
#                    test_smi and test_activity
# descriptor_files - Array of descriptor file names
# fingerprints     - Array of fingerprint specification strings
# predicted_stem   - file name stem for the -PRED argument of each command
# stats_stem       - file name stem for the -STATS argument of each command
# command_file     - name of the file to (over)write
def write_command_file(splits, descriptor_files, fingerprints, predicted_stem, stats_stem, command_file)
  $stderr << "Writing #{splits.size} splits with #{fingerprints.size} fingerprints\n"

  # Block form so the file is closed even if an exception is raised.
  File.open(command_file, "w") do |file|
    fingerprints.each do |fp|
      # Spaces are stripped so the fingerprint can be part of a file name.
      fptxt = fp.delete(' ')
      splits.each_with_index do |split, ndx|
        # NOTE(review): the fingerprint text is bracketed by a pair of -gfp
        # tokens; presumably an open/close delimiter - confirm in the client.
        file << "calibrate_svmfp_client.sh -gfp #{fp} -gfp " \
                "-TRsmi #{split.train_smi} -TRactivity #{split.train_activity} " \
                "-TEsmi #{split.test_smi} -TEactivity #{split.test_activity} " \
                "-PRED #{predicted_stem}.#{fptxt}.#{ndx} -STATS #{stats_stem}.#{fptxt}.#{ndx}" \
                "\n"
        $stderr << "Wrote #{fp} split #{ndx}\n"
      end
    end

    descriptor_files.each_with_index do |dfile, dfile_ndx|
      splits.each_with_index do |split, ndx|
        # Both output names carry the split index; without it every split of
        # a descriptor file would write to the same predictions file.
        suffix = "DSC.#{dfile_ndx}.#{ndx}"
        file << "calibrate_xgbd_client.sh -DESC #{dfile} " \
                "-TRactivity #{split.train_activity} " \
                "-TEactivity #{split.test_activity} " \
                "-PRED #{predicted_stem}.#{suffix} -STATS #{stats_stem}.#{suffix} " \
                "-uid #{dfile_ndx}.#{ndx}" \
                "\n"
      end
    end
  end
end

# main returns 0 on success and 1 on failure; hand that to the shell so a
# failed run is visible to the caller instead of always exiting 0.
exit(main)
1 change: 1 addition & 0 deletions src/Molecule_Tools/alogp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1577,6 +1577,7 @@ std::optional<float>
ALogP::LogP(Molecule& m) {

// Silently remove any explicit Hydrogen atoms.
m.transform_to_non_isotopic_form();
m.remove_all(1);

const int matoms = m.natoms();
Expand Down
4 changes: 3 additions & 1 deletion src/Molecule_Tools/alogp_main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ Options::Options() {
_remove_chirality = 0;
_smiles_tag = "$SMI<";
_identifier_tag = "PCN<";
_alogp_tag = "NCZLOGP<";
_alogp_tag = "NCALOGP<";
_function_as_tdt_filter = 0;
_bit_replicates = 9; // same default as clogp
_flush_after_every_molecule = 0;
Expand Down Expand Up @@ -337,6 +337,8 @@ Options::Preprocess(Molecule& m) {
return 0;
}

// We don't want isotopes.
m.transform_to_non_isotopic_form();
m.remove_all(1); // Always

if (_reduce_to_largest_fragment) {
Expand Down
1 change: 1 addition & 0 deletions src/Molecule_Tools/xlogp_main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@ Options::Preprocess(Molecule& m) {
return 0;
}

m.transform_to_non_isotopic_form();
m.remove_all(1); // Always

if (_reduce_to_largest_fragment) {
Expand Down
22 changes: 22 additions & 0 deletions src/Utilities/GFP_Tools/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -779,6 +779,28 @@ cc_binary(
],
)

# Binary target gfp_svmfp_score_v2, built from gfp_svmfp_score_v2.cc.
cc_binary(
name = "gfp_svmfp_score_v2",
srcs = [
"gfp_svmfp_score_v2.cc",
],
deps = [
":gfp_bit_subset",
":gfp_model_cc_proto",
"//Foundational/accumulator",
"//Foundational/cmdline_v2:cmdline_v2",
"//Foundational/data_source:iwstring_data_source",
"//Foundational/iw_tdt",
"//Foundational/iwmisc",
"//Foundational/iwmisc:normalisation",
"//Foundational/iwmisc:sparse_fp_creator",
"//Foundational/iwqsort",
"//Utilities/General:class_label_translation_cc_proto",
"//Utilities/General:scaler",
"//Utilities/GFP_Tools:gfp",
],
)

cc_binary(
name = "gfp_to_descriptors",
srcs = [
Expand Down
2 changes: 1 addition & 1 deletion src/Utilities/GFP_Tools/gfp_svmfp_score.cc
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ Fingerprint_and_Weight::weighted_similarity(IW_General_Fingerprint& fp,
float& max_similarity)
{
similarity_type_t sim;
if (EQUAL_WEIGHT_TANIMOTO == kernel_function) {
if (EQUAL_WEIGHT_TANIMOTO == kernel_function) [[likely]] {
sim = this->equal_weight_tanimoto(fp);
} else if (EQUAL_WEIGHT_DOT_PRODUCT == kernel_function) {
sim = this->equal_weight_dot_product(fp);
Expand Down
Loading

0 comments on commit e130f81

Please sign in to comment.