feat: upgrading to PyPI reinvent models 0.0.14 version (#1)
* feat: upgrading to pypi reinvent models 0.0.14 version

* feat: fixed the repo url and version in setup.py
Ashish13898 authored Mar 8, 2022
1 parent 6eb81f2 commit e1cf00d
Showing 32 changed files with 444 additions and 719 deletions.
reinvent_models/lib_invent/enums/generative_model_parameters.py
@@ -1,3 +1,5 @@


class GenerativeModelParametersEnum:
NUMBER_OF_LAYERS = "num_layers"
NUMBER_OF_DIMENSIONS = "num_dimensions"
@@ -12,4 +14,4 @@ def __getattr__(self, name):

# prohibit any attempt to set any values
def __setattr__(self, key, value):
raise ValueError("No changes allowed.")
raise ValueError("No changes allowed.")
2 changes: 2 additions & 0 deletions reinvent_models/lib_invent/enums/generative_model_regime.py
@@ -1,3 +1,5 @@


class GenerativeModelRegimeEnum:
INFERENCE = "inference"
TRAINING = "training"
21 changes: 5 additions & 16 deletions reinvent_models/lib_invent/models/dataset.py
@@ -27,9 +27,7 @@ def __init__(self, smiles_list, vocabulary, tokenizer):
self._encoded_list.append(enc)

def __getitem__(self, i):
return torch.tensor(
self._encoded_list[i], dtype=torch.long
) # pylint: disable=E1102
return torch.tensor(self._encoded_list[i], dtype=torch.long) # pylint: disable=E1102

def __len__(self):
return len(self._encoded_list)
@@ -47,21 +45,14 @@ def __init__(self, scaffold_decoration_smi_list, vocabulary):

self._encoded_list = []
for scaffold, dec in scaffold_decoration_smi_list:
en_scaff = self.vocabulary.scaffold_vocabulary.encode(
self.vocabulary.scaffold_tokenizer.tokenize(scaffold)
)
en_dec = self.vocabulary.decoration_vocabulary.encode(
self.vocabulary.decoration_tokenizer.tokenize(dec)
)
en_scaff = self.vocabulary.scaffold_vocabulary.encode(self.vocabulary.scaffold_tokenizer.tokenize(scaffold))
en_dec = self.vocabulary.decoration_vocabulary.encode(self.vocabulary.decoration_tokenizer.tokenize(dec))
if en_scaff is not None and en_dec is not None:
self._encoded_list.append((en_scaff, en_dec))

def __getitem__(self, i):
scaff, dec = self._encoded_list[i]
return (
torch.tensor(scaff, dtype=torch.long),
torch.tensor(dec, dtype=torch.long),
) # pylint: disable=E1102
return (torch.tensor(scaff, dtype=torch.long), torch.tensor(dec, dtype=torch.long)) # pylint: disable=E1102

def __len__(self):
return len(self._encoded_list)
@@ -83,9 +74,7 @@ def pad_batch(encoded_seqs):
:param encoded_seqs: A list of encoded sequences.
:return: A tensor with the sequences correctly padded.
"""
seq_lengths = torch.tensor(
[len(seq) for seq in encoded_seqs], dtype=torch.int64
) # pylint: disable=not-callable
seq_lengths = torch.tensor([len(seq) for seq in encoded_seqs], dtype=torch.int64) # pylint: disable=not-callable
if torch.cuda.is_available():
return (tnnur.pad_sequence(encoded_seqs, batch_first=True).cuda(), seq_lengths)
return (tnnur.pad_sequence(encoded_seqs, batch_first=True), seq_lengths)
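
Outside the diff, a small sketch of what the padding helper above does with variable-length encoded sequences; the token values below are dummy data and the CUDA branch is omitted:

import torch
import torch.nn.utils.rnn as tnnur

# Dummy encoded sequences of different lengths (stand-ins for tokenized SMILES).
encoded_seqs = [torch.tensor([1, 4, 7, 2], dtype=torch.long),
                torch.tensor([1, 5, 2], dtype=torch.long)]

seq_lengths = torch.tensor([len(seq) for seq in encoded_seqs], dtype=torch.int64)
padded_seqs = tnnur.pad_sequence(encoded_seqs, batch_first=True)  # zero-padded, shape (2, 4)
print(padded_seqs)   # tensor([[1, 4, 7, 2], [1, 5, 2, 0]])
print(seq_lengths)   # tensor([4, 3])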
132 changes: 38 additions & 94 deletions reinvent_models/lib_invent/models/decorator.py
@@ -7,9 +7,7 @@
import torch.nn as tnn
import torch.nn.utils.rnn as tnnur

from reinvent_models.lib_invent.enums.generative_model_parameters import (
GenerativeModelParametersEnum,
)
from reinvent_models.lib_invent.enums.generative_model_parameters import GenerativeModelParametersEnum


class Encoder(tnn.Module):
@@ -27,16 +25,10 @@ def __init__(self, num_layers, num_dimensions, vocabulary_size, dropout):

self._embedding = tnn.Sequential(
tnn.Embedding(self.vocabulary_size, self.num_dimensions),
tnn.Dropout(dropout),
)
self._rnn = tnn.LSTM(
self.num_dimensions,
self.num_dimensions,
self.num_layers,
batch_first=True,
dropout=self.dropout,
bidirectional=True,
tnn.Dropout(dropout)
)
self._rnn = tnn.LSTM(self.num_dimensions, self.num_dimensions, self.num_layers,
batch_first=True, dropout=self.dropout, bidirectional=True)

def forward(self, padded_seqs, seq_lengths): # pylint: disable=arguments-differ
# FIXME: This fails with a batch of 1 because squeezing looses a dimension with size 1
@@ -53,40 +45,26 @@ def forward(self, padded_seqs, seq_lengths): # pylint: disable=arguments-differ
padded_seqs = self._embedding(padded_seqs)
hs_h, hs_c = (hidden_state, hidden_state.clone().detach())

# FIXME: this is to guard against non compatible `gpu` input for pack_padded_sequence() method in pytorch 1.7
#FIXME: this is to guard against non compatible `gpu` input for pack_padded_sequence() method in pytorch 1.7
seq_lengths = seq_lengths.cpu()

packed_seqs = tnnur.pack_padded_sequence(
padded_seqs, seq_lengths, batch_first=True, enforce_sorted=False
)
packed_seqs = tnnur.pack_padded_sequence(padded_seqs, seq_lengths, batch_first=True, enforce_sorted=False)
packed_seqs, (hs_h, hs_c) = self._rnn(packed_seqs, (hs_h, hs_c))
padded_seqs, _ = tnnur.pad_packed_sequence(packed_seqs, batch_first=True)

# sum up bidirectional layers and collapse
hs_h = (
hs_h.view(self.num_layers, 2, batch_size, self.num_dimensions)
.sum(dim=1)
.squeeze()
) # (layers, batch, dim)
hs_c = (
hs_c.view(self.num_layers, 2, batch_size, self.num_dimensions)
.sum(dim=1)
.squeeze()
) # (layers, batch, dim)
padded_seqs = (
padded_seqs.view(batch_size, max_seq_size, 2, self.num_dimensions)
.sum(dim=2)
.squeeze()
) # (batch, seq, dim)
hs_h = hs_h.view(self.num_layers, 2, batch_size, self.num_dimensions)\
.sum(dim=1).squeeze() # (layers, batch, dim)
hs_c = hs_c.view(self.num_layers, 2, batch_size, self.num_dimensions)\
.sum(dim=1).squeeze() # (layers, batch, dim)
padded_seqs = padded_seqs.view(batch_size, max_seq_size, 2, self.num_dimensions)\
.sum(dim=2).squeeze() # (batch, seq, dim)

return padded_seqs, (hs_h, hs_c)

def _initialize_hidden_state(self, batch_size):
if torch.cuda.is_available():
return torch.zeros(
self.num_layers * 2, batch_size, self.num_dimensions
).cuda()
return torch.zeros(self.num_layers * 2, batch_size, self.num_dimensions)
return torch.zeros(self.num_layers*2, batch_size, self.num_dimensions).cuda()

def get_params(self):
parameter_enums = GenerativeModelParametersEnum
@@ -98,23 +76,23 @@ def get_params(self):
parameter_enums.NUMBER_OF_LAYERS: self.num_layers,
parameter_enums.NUMBER_OF_DIMENSIONS: self.num_dimensions,
parameter_enums.VOCABULARY_SIZE: self.vocabulary_size,
parameter_enums.DROPOUT: self.dropout,
parameter_enums.DROPOUT: self.dropout
}


class AttentionLayer(tnn.Module):

def __init__(self, num_dimensions):
super(AttentionLayer, self).__init__()

self.num_dimensions = num_dimensions

self._attention_linear = tnn.Sequential(
tnn.Linear(self.num_dimensions * 2, self.num_dimensions), tnn.Tanh()
tnn.Linear(self.num_dimensions*2, self.num_dimensions),
tnn.Tanh()
)

def forward(
self, padded_seqs, encoder_padded_seqs, decoder_mask
): # pylint: disable=arguments-differ
def forward(self, padded_seqs, encoder_padded_seqs, decoder_mask): # pylint: disable=arguments-differ
"""
Performs the forward pass.
:param padded_seqs: A tensor with the output sequences (batch, seq_d, dim).
@@ -124,19 +102,12 @@ def forward(
"""
# scaled dot-product
# (batch, seq_d, 1, dim)*(batch, 1, seq_e, dim) => (batch, seq_d, seq_e*)
attention_weights = (
(padded_seqs.unsqueeze(dim=2) * encoder_padded_seqs.unsqueeze(dim=1))
.sum(dim=3)
.div(math.sqrt(self.num_dimensions))
attention_weights = (padded_seqs.unsqueeze(dim=2)*encoder_padded_seqs.unsqueeze(dim=1))\
.sum(dim=3).div(math.sqrt(self.num_dimensions))\
.softmax(dim=2)
)
# (batch, seq_d, seq_e*)@(batch, seq_e, dim) => (batch, seq_d, dim)
attention_context = attention_weights.bmm(encoder_padded_seqs)
return (
self._attention_linear(torch.cat([padded_seqs, attention_context], dim=2))
* decoder_mask,
attention_weights,
)
return (self._attention_linear(torch.cat([padded_seqs, attention_context], dim=2))*decoder_mask, attention_weights)


class Decoder(tnn.Module):
@@ -154,26 +125,16 @@ def __init__(self, num_layers, num_dimensions, vocabulary_size, dropout):

self._embedding = tnn.Sequential(
tnn.Embedding(self.vocabulary_size, self.num_dimensions),
tnn.Dropout(dropout),
)
self._rnn = tnn.LSTM(
self.num_dimensions,
self.num_dimensions,
self.num_layers,
batch_first=True,
dropout=self.dropout,
bidirectional=False,
tnn.Dropout(dropout)
)
self._rnn = tnn.LSTM(self.num_dimensions, self.num_dimensions, self.num_layers,
batch_first=True, dropout=self.dropout, bidirectional=False)

self._attention = AttentionLayer(self.num_dimensions)

self._linear = tnn.Linear(
self.num_dimensions, self.vocabulary_size
) # just to redimension
self._linear = tnn.Linear(self.num_dimensions, self.vocabulary_size) # just to redimension

def forward(
self, padded_seqs, seq_lengths, encoder_padded_seqs, hidden_states
): # pylint: disable=arguments-differ
def forward(self, padded_seqs, seq_lengths, encoder_padded_seqs, hidden_states): # pylint: disable=arguments-differ
"""
Performs the forward pass.
:param padded_seqs: A tensor with the output sequences (batch, seq_d, dim).
@@ -187,20 +148,13 @@ def forward(

padded_encoded_seqs = self._embedding(padded_seqs)
packed_encoded_seqs = tnnur.pack_padded_sequence(
padded_encoded_seqs, seq_lengths, batch_first=True, enforce_sorted=False
)
packed_encoded_seqs, hidden_states = self._rnn(
packed_encoded_seqs, hidden_states
)
padded_encoded_seqs, _ = tnnur.pad_packed_sequence(
packed_encoded_seqs, batch_first=True
) # (batch, seq, dim)
padded_encoded_seqs, seq_lengths, batch_first=True, enforce_sorted=False)
packed_encoded_seqs, hidden_states = self._rnn(packed_encoded_seqs, hidden_states)
padded_encoded_seqs, _ = tnnur.pad_packed_sequence(packed_encoded_seqs, batch_first=True) # (batch, seq, dim)

mask = (padded_encoded_seqs[:, :, 0] != 0).unsqueeze(dim=-1).type(torch.float)
attn_padded_encoded_seqs, attention_weights = self._attention(
padded_encoded_seqs, encoder_padded_seqs, mask
)
logits = self._linear(attn_padded_encoded_seqs) * mask # (batch, seq, voc_size)
attn_padded_encoded_seqs, attention_weights = self._attention(padded_encoded_seqs, encoder_padded_seqs, mask)
logits = self._linear(attn_padded_encoded_seqs)*mask # (batch, seq, voc_size)
return logits, hidden_states, attention_weights

def get_params(self):
@@ -213,7 +167,7 @@ def get_params(self):
parameter_enum.NUMBER_OF_LAYERS: self.num_layers,
parameter_enum.NUMBER_OF_DIMENSIONS: self.num_dimensions,
parameter_enum.VOCABULARY_SIZE: self.vocabulary_size,
parameter_enum.DROPOUT: self.dropout,
parameter_enum.DROPOUT: self.dropout
}


@@ -228,9 +182,7 @@ def __init__(self, encoder_params, decoder_params):
self._encoder = Encoder(**encoder_params)
self._decoder = Decoder(**decoder_params)

def forward(
self, encoder_seqs, encoder_seq_lengths, decoder_seqs, decoder_seq_lengths
): # pylint: disable=arguments-differ
def forward(self, encoder_seqs, encoder_seq_lengths, decoder_seqs, decoder_seq_lengths): # pylint: disable=arguments-differ
"""
Performs the forward pass.
:param encoder_seqs: A tensor with the output sequences (batch, seq_d, dim).
@@ -239,12 +191,8 @@ def forward(
:param decoder_seq_lengths: The lengths of the decoder sequences.
:return : The output logits as a tensor (batch, seq_d, dim).
"""
encoder_padded_seqs, hidden_states = self.forward_encoder(
encoder_seqs, encoder_seq_lengths
)
logits, _, _ = self.forward_decoder(
decoder_seqs, decoder_seq_lengths, encoder_padded_seqs, hidden_states
)
encoder_padded_seqs, hidden_states = self.forward_encoder(encoder_seqs, encoder_seq_lengths)
logits, _, _ = self.forward_decoder(decoder_seqs, decoder_seq_lengths, encoder_padded_seqs, hidden_states)
return logits

def forward_encoder(self, padded_seqs, seq_lengths):
@@ -256,19 +204,15 @@ def forward_encoder(self, padded_seqs, seq_lengths):
"""
return self._encoder(padded_seqs, seq_lengths)

def forward_decoder(
self, padded_seqs, seq_lengths, encoder_padded_seqs, hidden_states
):
def forward_decoder(self, padded_seqs, seq_lengths, encoder_padded_seqs, hidden_states):
"""
Does a forward pass only of the decoder.
:param hidden_states: The hidden states from the encoder.
:param padded_seqs: The data to feed to the decoder.
:param seq_lengths: The length of each sequence in the batch.
:return : Returns the logits and the hidden state for each element of the sequence passed.
"""
return self._decoder(
padded_seqs, seq_lengths, encoder_padded_seqs, hidden_states
)
return self._decoder(padded_seqs, seq_lengths, encoder_padded_seqs, hidden_states)

def get_params(self):
"""
@@ -277,5 +221,5 @@
"""
return {
"encoder_params": self._encoder.get_params(),
"decoder_params": self._decoder.get_params(),
"decoder_params": self._decoder.get_params()
}
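
As a usage sketch for the module above (not part of the commit): assuming the top-level class in decorator.py is named Decorator and is importable as shown, the encoder/decoder pair could be exercised on dummy batches roughly like this. All sizes and token values are made up for illustration.

import torch
from reinvent_models.lib_invent.models.decorator import Decorator  # assumed class/module path

# Hypothetical hyperparameters; the keys mirror the Encoder/Decoder constructor arguments above.
encoder_params = {"num_layers": 3, "num_dimensions": 256, "vocabulary_size": 40, "dropout": 0.2}
decoder_params = {"num_layers": 3, "num_dimensions": 256, "vocabulary_size": 60, "dropout": 0.2}
model = Decorator(encoder_params, decoder_params)

# Dummy padded token batches (batch size 2) with their true lengths.
scaffold_seqs = torch.randint(0, 40, (2, 12), dtype=torch.long)
scaffold_lengths = torch.tensor([12, 9], dtype=torch.int64)
decoration_seqs = torch.randint(0, 60, (2, 8), dtype=torch.long)
decoration_lengths = torch.tensor([8, 6], dtype=torch.int64)

logits = model(scaffold_seqs, scaffold_lengths, decoration_seqs, decoration_lengths)
print(logits.shape)  # expected: torch.Size([2, 8, 60]) -> (batch, decoration_seq, vocabulary_size)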