Documentation WIP

EliLillyCo · Nov 16, 2024 · 40f3dbd · 40f3dbd
1 parent ccbed1b
commit 40f3dbd
Show file tree

Hide file tree

Showing 3 changed files with 19 additions and 28 deletions.
diff --git a/docs/GFP/gfp_nearneighbours_single_file.md b/docs/GFP/gfp_nearneighbours_single_file.md
@@ -9,7 +9,7 @@ For comparisons of one collection against another see
 
 For each molecule in a set, find the single nearest neighbour.
 ```
-gfp_make.sh file.smi > file.gfp
+gfp_make.sh <fingerprints> file.smi > file.gfp
 gfp_nearneighbours_single_file -n 1 file.gfp > file.nn
 nplotnn file.nn > file.nn.smi
 ```
@@ -55,6 +55,8 @@ If the `-T` option is used, then for each molecule, all neighbours within that d
 are accumulated, so it is very likely that the molecules will have differing numbers of
 neighbours.
 
+This tool can be used to compute the nearest neighbour data used by [train_test_split_optimise](train_test_split_optimise.md).
+
 ### TBB
 There is a version of this tool that uses TBB (Threading Building Blocks) to subdivide
 the work across threads. Many of the options are the same, but the most important is
@@ -69,6 +71,7 @@ of parallelism is approximately half of the -h value. So, on an 8 core machine
 -h 16 will result in almost full use of the machine, but as usual, scaling is
 not linear.
 ![speed](Images/gfp_nearneighbours_single_file_tbb.png)
+
 In this case, serial execution takes 85 seconds, so an ideal 8 way
 speed-up would result in -h 16 running in 10.1 seconds. We observe 11.22 seconds,
 which is an unusually good speed-up for 8 cores.

diff --git a/docs/GFP/train_test_split_optimise.md b/docs/GFP/train_test_split_optimise.md
@@ -23,23 +23,17 @@ classes is preserved.
 # HOWTO
 First generate fingerprints and neighbours. Use as long a distance as seems reasonable.
 ```
-gfp_make.sh file.smi > file.gfp
-gfp_nearneighbours_single_file -v -T 0.4 file.gfp > file.nn
-```
-Convert to tfdata format
-```
-nn2proto -T file.tfdata -v file.nn
-```
-Note that eventually `gfp_nearneighbours_single_file` will be able to 
-generate this file directly.
-
-Update: That is now available.
-```
-gfp_nearneighbours_single_file -v -T 0.4 -S file.tfdata file.gfp 
+gfp_make.sh <fingerprints> file.smi > file.gfp
+gfp_nearneighbours_single_file -v -T 0.4 -S file.tfdata file.gfp
 ```
+The -S option means that the data is written as TFDataRecord serialized protos.
 Depending on the size of the dataset and the distance selected, this
 file can be large.
 
+To save time and take advantage of multiple cores consider using 
+[gfp_nearneighbours_single_file_tbb](gfp_nearneighbours_single_file_tbb.md) which
+runs the nearest neighbour determination using multiple threads.
+
 Run the optimisation.
 ```
 train_test_split_optimise -f 0.8 -n 10 -S SPLIT -o 1000000 -v -r 10000 file.tfdata

diff --git a/src/Utilities/GFP_Tools/gfp_nearneighbours_single_file_tbb.cc b/src/Utilities/GFP_Tools/gfp_nearneighbours_single_file_tbb.cc
@@ -649,23 +649,19 @@ gfp_nearneighbours_diagonal(F* pool, const int istart, const int istop)
   return;
 }
 
-/*
-  Not really parallel at all
-*/
+//  Not really parallel at all
 
 template <typename F>
 void
-gfp_nearneighbours_parallel1(F* pool, const int pool_size)
-{
+gfp_nearneighbours_parallel1(F* pool, const int pool_size) {
   gfp_nearneighbours_diagonal(pool, 0, pool_size);
 
   return;
 }
 
 template <typename F>
 void
-gfp_nearneighbours_parallel2(F* pool, const int pool_size)
-{
+gfp_nearneighbours_parallel2(F* pool, const int pool_size) {
   const int half = pool_size / 2;
 
   tbb::task_group g1;
@@ -686,8 +682,7 @@ gfp_nearneighbours_parallel2(F* pool, const int pool_size)
 
 template <typename F>
 void
-gfp_nearneighbours_parallel3(F* pool, const int pool_size)
-{
+gfp_nearneighbours_parallel3(F* pool, const int pool_size) {
   const int third = pool_size / 3;
   const int two_thirds = (2 * pool_size) / 3;
 
@@ -717,6 +712,8 @@ gfp_nearneighbours_parallel16_latin_square(F* pool, const int pool_size)
 
   p[16] = pool_size;
 
+  // Written by gfp_nearneighbours_single_file_tbb.rb
+
   tbb::task_group g0;
   g0.run([&] { gfp_nearneighbours(pool, p[0], p[1], p[1], p[2]); });
   g0.run([&] { gfp_nearneighbours(pool, p[2], p[3], p[3], p[4]); });
@@ -933,22 +930,19 @@ gfp_nearneighbours_parallel16_latin_square(F* pool, const int pool_size)
   Idea from Steve Ruberg
 
   Actually this is kind of like a Latin Square, but I ended up writing a ruby
-  script, `latin_square.rb` that does this explicitly.
+  script that does this explicitly.
 */
 
 template <typename F>
 void
-gfp_nearneighbours_parallel8_latin_square(F* pool, const int pool_size)
-{
+gfp_nearneighbours_parallel8_latin_square(F* pool, const int pool_size) {
   int p[9];
   for (int i = 0; i < 8; ++i) {
     p[i] = (i * pool_size) / 8;
   }
 
   p[8] = pool_size;
 
-  // This code is written by latin_square.rb
-
   tbb::task_group g1;
   g1.run([&] { gfp_nearneighbours(pool, p[0], p[1], p[7], p[8]); });
   g1.run([&] { gfp_nearneighbours(pool, p[1], p[2], p[6], p[7]); });