diff --git a/src/grouped_sampling/batch_end_to_end_pipeline.py b/src/grouped_sampling/batch_end_to_end_pipeline.py index 95bd372..6ce7841 100644 --- a/src/grouped_sampling/batch_end_to_end_pipeline.py +++ b/src/grouped_sampling/batch_end_to_end_pipeline.py @@ -1,7 +1,7 @@ from typing import List, Optional import torch -from torch import Tensor, long, inference_mode, full, argmax, int8, eq, ones_like +from torch import Tensor, argmax, eq, full, inference_mode, int8, long, ones_like from transformers import GenerationConfig from src.grouped_sampling.logits_vec_to_token import LogitVectorToTokenPipeLine @@ -10,12 +10,13 @@ class BatchEndToEndSingleSequencePipeLine: + def __init__( - self, - model_name: str, - load_in_8bit: bool = False, - model_kwargs: Optional[dict] = None, - generation_config: Optional[GenerationConfig] = None, + self, + model_name: str, + load_in_8bit: bool = False, + model_kwargs: Optional[dict] = None, + generation_config: Optional[GenerationConfig] = None, ): """ Create a new BatchEndToEndSingleSequencePipeLine. @@ -33,15 +34,22 @@ def __init__( TypeError: If one of the arguments is of the wrong type. """ if not isinstance(model_name, str): - raise TypeError(f"model_name should be a string, got {type(model_name)}") + raise TypeError( + f"model_name should be a string, got {type(model_name)}") if not isinstance(load_in_8bit, bool): - raise TypeError(f"load_in_8bit should be a bool, got {type(load_in_8bit)}") + raise TypeError( + f"load_in_8bit should be a bool, got {type(load_in_8bit)}") if model_kwargs is not None and not isinstance(model_kwargs, dict): - raise TypeError(f"model_kwargs should be a dict or None, got {type(model_kwargs)}") - if generation_config is not None and not isinstance(generation_config, GenerationConfig): - raise TypeError(f"generation_config should be a GenerationConfig or None, got {type(generation_config)}") + raise TypeError( + f"model_kwargs should be a dict or None, got {type(model_kwargs)}" + ) + if generation_config is not None and not isinstance( + generation_config, GenerationConfig): + raise TypeError( + f"generation_config should be a GenerationConfig or None, got {type(generation_config)}" + ) if not load_in_8bit: - torch.set_float32_matmul_precision('high') + torch.set_float32_matmul_precision("high") self.tokenizer = get_tokenizer(model_name) if model_kwargs is None: self.model = get_model( @@ -57,18 +65,23 @@ def __init__( self.device: torch.device = self.model.device self.max_total_len = self.model.config.max_position_embeddings if generation_config is None: - generation_config = GenerationConfig.from_model_config(self.model.config) - self.logit_to_token_pipeline = LogitVectorToTokenPipeLine(generation_config=generation_config) + generation_config = GenerationConfig.from_model_config( + self.model.config) + self.logit_to_token_pipeline = LogitVectorToTokenPipeLine( + generation_config=generation_config) def tokenize_and_pad( - self, - prompts: List[str], - output_length: int, + self, + prompts: List[str], + output_length: int, ) -> Tensor: """A helper function that converts a list of strings to a padded tensor of tokens.""" prompt_tokens_list = self.tokenizer.batch_encode_plus( - prompts, add_special_tokens=True, padding=True, return_attention_mask=False, - )['input_ids'] + prompts, + add_special_tokens=True, + padding=True, + return_attention_mask=False, + )["input_ids"] prompt_tokens = torch.tensor( prompt_tokens_list, device=self.device, @@ -77,7 +90,9 @@ def tokenize_and_pad( max_input_length = prompt_tokens.shape[1] 
padding_length: int = max_input_length + output_length - 1 if padding_length > self.max_total_len: - raise ValueError(f"padding_length should be at most {self.max_total_len}, got {padding_length}") + raise ValueError( + f"padding_length should be at most {self.max_total_len}, got {padding_length}" + ) batch_size = prompt_tokens.shape[0] extra_padding = full( fill_value=self.tokenizer.pad_token_id, @@ -88,9 +103,9 @@ def tokenize_and_pad( return torch.cat([prompt_tokens, extra_padding], dim=1) def tokens_batch_to_logit_matrices( - self, - padded_tokens: Tensor, - output_length: int, + self, + padded_tokens: Tensor, + output_length: int, ) -> List[Tensor]: """ Given a batch of prompts where each prompt is a sequence of tokens, and an output_length, @@ -98,21 +113,29 @@ def tokens_batch_to_logit_matrices( where logits[i] is the logits matrix of the i-th prompt. """ if padded_tokens.dim() != 2: - raise ValueError(f"tokens should be a 2D tensor, got {padded_tokens.dim()}D tensor") + raise ValueError( + f"tokens should be a 2D tensor, got {padded_tokens.dim()}D tensor" + ) if padded_tokens.requires_grad: raise ValueError("tokens should not require grad") if not isinstance(output_length, int): - raise TypeError(f"output_length should be an int, got {type(output_length)}") + raise TypeError( + f"output_length should be an int, got {type(output_length)}") if output_length <= 0: - raise ValueError(f"output_length should be positive, got {output_length}") - attenction_mask = ones_like(padded_tokens, dtype=torch.long, device=self.device, requires_grad=False) + raise ValueError( + f"output_length should be positive, got {output_length}") + attenction_mask = ones_like(padded_tokens, + dtype=torch.long, + device=self.device, + requires_grad=False) all_logits = self.model( output_attentions=False, output_hidden_states=False, input_ids=padded_tokens, attention_mask=attenction_mask, ).logits - padding_int_tokens = eq(padded_tokens, self.tokenizer.pad_token_id).to(int8) + padding_int_tokens = eq(padded_tokens, + self.tokenizer.pad_token_id).to(int8) last_non_pad_indices = argmax(padding_int_tokens, dim=1) - 1 relavent_logits = [ all_logits[i, index:index + output_length] @@ -122,11 +145,15 @@ def tokens_batch_to_logit_matrices( def _validate_output_length(self, output_length: int) -> None: if not isinstance(output_length, int): - raise TypeError(f"output_length should be an int, got {type(output_length)}") + raise TypeError( + f"output_length should be an int, got {type(output_length)}") if output_length <= 0: - raise ValueError(f"output_length should be positive, got {output_length}") + raise ValueError( + f"output_length should be positive, got {output_length}") if output_length >= self.max_total_len: - raise ValueError(f"output_length should be smaller than {self.max_total_len}, got {output_length}") + raise ValueError( + f"output_length should be smaller than {self.max_total_len}, got {output_length}" + ) @staticmethod def _validate_prompts(prompts: List[str]): @@ -137,9 +164,9 @@ def _validate_prompts(prompts: List[str]): @inference_mode() def genearte_batch( - self, - prompts: List[str] | str, - output_length: int, + self, + prompts: List[str] | str, + output_length: int, ) -> List[str]: """ Given a batch of prompts and output length, generates a list of output strings. 
@@ -160,9 +187,11 @@ def genearte_batch( return [] self._validate_prompts(prompts) padded_tokens = self.tokenize_and_pad(prompts, output_length) - logits = self.tokens_batch_to_logit_matrices(padded_tokens, output_length) + logits = self.tokens_batch_to_logit_matrices(padded_tokens, + output_length) output_tokens = self.logit_to_token_pipeline.batch_to_tokens( input_ids=padded_tokens, batch=logits, ) - return self.tokenizer.batch_decode(output_tokens, skip_special_tokens=True) + return self.tokenizer.batch_decode(output_tokens, + skip_special_tokens=True) diff --git a/src/grouped_sampling/logits_vec_to_token.py b/src/grouped_sampling/logits_vec_to_token.py index ad1d754..6a4c4ae 100644 --- a/src/grouped_sampling/logits_vec_to_token.py +++ b/src/grouped_sampling/logits_vec_to_token.py @@ -2,8 +2,13 @@ from typing import List import torch -from torch import Tensor, long, FloatTensor, multinomial, argmax, exp -from transformers import GenerationConfig, LogitsProcessorList, GenerationMixin, LogitsProcessor +from torch import FloatTensor, Tensor, argmax, exp, long, multinomial +from transformers import ( + GenerationConfig, + GenerationMixin, + LogitsProcessor, + LogitsProcessorList, +) from transformers.generation import LogitNormalization @@ -21,8 +26,10 @@ def __call__(self, input_ids: Tensor, scores: Tensor) -> Tensor: class LogitVectorToTokenPipeLine: + @staticmethod - def prepare_generation_config(generation_config: GenerationConfig) -> GenerationConfig: + def prepare_generation_config( + generation_config: GenerationConfig, ) -> GenerationConfig: """ Prepare a generation config for the LogitVectorToTokenPipeLine. Args: @@ -33,35 +40,41 @@ def prepare_generation_config(generation_config: GenerationConfig) -> Generation generation_config_copy = copy.deepcopy(generation_config) generation_config_copy.renormalize_logits = True generation_config_copy.num_beams = 1 - required_attrs = ("epsilon_cutoff", "temperature", "top_k", "top_p", "typical_p", "eta_cutoff") + required_attrs = ( + "epsilon_cutoff", + "temperature", + "top_k", + "top_p", + "typical_p", + "eta_cutoff", + ) for attr in required_attrs: if not hasattr(generation_config_copy, attr): setattr(generation_config_copy, attr, None) return generation_config_copy def __init__( - self, - generation_config: GenerationConfig, + self, + generation_config: GenerationConfig, ): - generation_config_copy = self.prepare_generation_config(generation_config) + generation_config_copy = self.prepare_generation_config( + generation_config) mixin = GenerationMixin() mixin.generation_config = generation_config_copy # noinspection PyProtectedMember - self.logit_wrapper: LogitsProcessorList = mixin._get_logits_warper(generation_config_copy) + self.logit_wrapper: LogitsProcessorList = mixin._get_logits_warper( + generation_config_copy) self.do_sample = generation_config_copy.do_sample if isinstance(self.logit_wrapper[-1], LogitNormalization): self.logit_wrapper.pop(-1) if self.do_sample: - softmax = SoftmaxLogitNormalization(temperature=generation_config_copy.temperature) + softmax = SoftmaxLogitNormalization( + temperature=generation_config_copy.temperature) self.logit_wrapper.append(softmax) - def single_logit_vector_to_token( - self, - input_ids: Tensor, - logits: FloatTensor, - **kwargs - ) -> long: + def single_logit_vector_to_token(self, input_ids: Tensor, + logits: FloatTensor, **kwargs) -> long: """ Convert a single logit vector to a token id. 
args: @@ -69,28 +82,33 @@ def single_logit_vector_to_token( logits: torch.FloatTensor of shape (vocab_size, ) with the logits for the next token. """ if input_ids.dim() != 1 or input_ids.shape[0] == 0: - raise ValueError(f"input_ids should be a 1D long tensor. " - f"Got input ids with shape {input_ids.shape} and dimention {input_ids.dim()}") + raise ValueError( + f"input_ids should be a 1D long tensor. " + f"Got input ids with shape {input_ids.shape} and dimention {input_ids.dim()}" + ) if logits.dim() == 1: logits = logits.unsqueeze(0) if logits.dim() != 2 or min(logits.shape) == 0: - raise ValueError(f"logits should be a 1D float tensor." - f"Got logits with shape {logits.shape} and dimention {logits.dim()}") - wrapped_logits = self.logit_wrapper(input_ids=input_ids, scores=logits, **kwargs) + raise ValueError( + f"logits should be a 1D float tensor." + f"Got logits with shape {logits.shape} and dimention {logits.dim()}" + ) + wrapped_logits = self.logit_wrapper(input_ids=input_ids, + scores=logits, + **kwargs) if self.do_sample: if not torch.all(wrapped_logits >= 0): - raise RuntimeError(f"Probabilities should be non-negative, got {wrapped_logits}") + raise RuntimeError( + f"Probabilities should be non-negative, got {wrapped_logits}" + ) if not torch.isclose(torch.sum(wrapped_logits), torch.tensor(1.0)): - raise RuntimeError(f"Probabilities should sum to 1.0, got {wrapped_logits}") + raise RuntimeError( + f"Probabilities should sum to 1.0, got {wrapped_logits}") return multinomial(wrapped_logits, num_samples=1) return argmax(wrapped_logits, dim=-1) - def logit_matrix_to_tokens( - self, - input_ids: Tensor, - logit_vectors: Tensor, - **kwargs - ) -> Tensor: + def logit_matrix_to_tokens(self, input_ids: Tensor, logit_vectors: Tensor, + **kwargs) -> Tensor: """ Convert multipule logit vectors to token ids in parallel. args: @@ -99,23 +117,31 @@ def logit_matrix_to_tokens( The input sequence is the same for all the logits. """ if input_ids.dim() != 1 or input_ids.shape[0] == 0: - raise ValueError(f"input_ids should be a 1D long tensor" - f"Got input ids with shape {input_ids.shape} and dimention {input_ids.dim()}") + raise ValueError( + f"input_ids should be a 1D long tensor" + f"Got input ids with shape {input_ids.shape} and dimention {input_ids.dim()}" + ) if logit_vectors.dim() != 2 or min(logit_vectors.shape) == 0: - raise ValueError(f"logits should be a 2D float tensor" - f"Got logits with shape {logit_vectors.shape} and dimention {logit_vectors.dim()}") + raise ValueError( + f"logits should be a 2D float tensor" + f"Got logits with shape {logit_vectors.shape} and dimention {logit_vectors.dim()}" + ) def logit_vector_to_token(vector) -> long: - return self.single_logit_vector_to_token(input_ids, vector, **kwargs) + return self.single_logit_vector_to_token(input_ids, vector, + **kwargs) - tokens = [logit_vector_to_token(logit_vector) for logit_vector in logit_vectors] + tokens = [ + logit_vector_to_token(logit_vector) + for logit_vector in logit_vectors + ] return torch.stack(tokens, dim=0).squeeze() def batch_to_tokens( - self, - input_ids: Tensor, - batch: List[Tensor], + self, + input_ids: Tensor, + batch: List[Tensor], ) -> List[Tensor]: """ Convert a batch of logit matrices to tokens. @@ -126,27 +152,44 @@ def batch_to_tokens( Returns: A list of Tensors with length output_seq_len with the output tokens for every sequence in the batch. 
""" - if not isinstance(input_ids, Tensor) or input_ids.dtype not in [torch.long, torch.int]: - raise ValueError(f"input_ids should be a Tensor of dtype int or long, got {input_ids}") + if not isinstance(input_ids, Tensor) or input_ids.dtype not in [ + torch.long, + torch.int, + ]: + raise ValueError( + f"input_ids should be a Tensor of dtype int or long, got {input_ids}" + ) if not isinstance(batch, list): raise ValueError(f"batch should be a a list, got {type(batch)}") if not all(isinstance(logit_matrix, Tensor) for logit_matrix in batch): - raise ValueError(f"batch should be a a list of Tensors, got a list of {type(batch[0])}") - if not all(logit_matrix.dtype in (torch.float, torch.float16) for logit_matrix in batch): - raise ValueError(f"batch should be a a list of Tensors of dtype float, got a list of {batch[0].dtype}") + raise ValueError( + f"batch should be a a list of Tensors, got a list of {type(batch[0])}" + ) + if not all(logit_matrix.dtype in (torch.float, torch.float16) + for logit_matrix in batch): + raise ValueError( + f"batch should be a a list of Tensors of dtype float, got a list of {batch[0].dtype}" + ) if input_ids.dim() != 2 or min(input_ids.shape) == 0: - raise ValueError(f"input_ids should be a 2D long tensor" - f"Got input ids with shape {input_ids.shape} and dimention {input_ids.dim()}") - if any(logit_matrix.dim() != 2 or min(logit_matrix.shape) == 0 for logit_matrix in batch): - raise ValueError(f"each logit matrix in batch should be a 2D float tensor" - f"Got batch: {batch}") + raise ValueError( + f"input_ids should be a 2D long tensor" + f"Got input ids with shape {input_ids.shape} and dimention {input_ids.dim()}" + ) + if any(logit_matrix.dim() != 2 or min(logit_matrix.shape) == 0 + for logit_matrix in batch): + raise ValueError( + f"each logit matrix in batch should be a 2D float tensor" + f"Got batch: {batch}") if len(batch) != input_ids.shape[0]: - raise ValueError(f"batch and input_ids should have the same batch size" - f"Got batch with size {len(batch)} and input_ids with shape {input_ids.shape}") + raise ValueError( + f"batch and input_ids should have the same batch size" + f"Got batch with size {len(batch)} and input_ids with shape {input_ids.shape}" + ) all_output_seqs = [] for logit_matrix, curr_sequence in zip(batch, input_ids): - curr_output_seq = torch.stack( - [self.single_logit_vector_to_token(curr_sequence, logit_vector) for logit_vector in logit_matrix], - ).squeeze() + curr_output_seq = torch.stack([ + self.single_logit_vector_to_token(curr_sequence, logit_vector) + for logit_vector in logit_matrix + ], ).squeeze() all_output_seqs.append(curr_output_seq) return all_output_seqs diff --git a/src/grouped_sampling/model.py b/src/grouped_sampling/model.py index d064f6c..05a22dc 100644 --- a/src/grouped_sampling/model.py +++ b/src/grouped_sampling/model.py @@ -1,7 +1,7 @@ from warnings import warn from huggingface_hub.utils import RepositoryNotFoundError -from torch import cuda, compile, inference_mode +from torch import compile, cuda, inference_mode from torch._dynamo import OptimizedModule from torch.cuda import OutOfMemoryError from transformers import AutoModelForCausalLM @@ -9,9 +9,9 @@ @inference_mode() def get_model( - model_name: str, - load_in_8bit: bool = False, - **kwargs, + model_name: str, + load_in_8bit: bool = False, + **kwargs, ) -> OptimizedModule: """ Load a model from the huggingface model hub, and compile it for faster inference. 
@@ -40,8 +40,7 @@ def get_model( f"Model {model_name} not found in the model hub.\n" "If you are trying to use a local model, make sure to use the full path.\n" "If you are trying to load a private model, make sure to pass your huggingface token." - + str(error) - ) + + str(error)) if cuda.is_available(): try: model = model.cuda() @@ -51,4 +50,4 @@ def get_model( warn("CUDA is not avilable, using the CPU instead") model = model.eval() model = compile(model) - return model \ No newline at end of file + return model diff --git a/src/grouped_sampling/tokenizer.py b/src/grouped_sampling/tokenizer.py index 7fd19e7..10f6125 100644 --- a/src/grouped_sampling/tokenizer.py +++ b/src/grouped_sampling/tokenizer.py @@ -1,23 +1,33 @@ import json import os from functools import lru_cache -from typing import Optional, Dict +from typing import Dict, Optional import requests -from huggingface_hub.utils import validate_repo_id, HFValidationError, RepositoryNotFoundError -from transformers import PreTrainedTokenizer, AutoTokenizer, PreTrainedTokenizerFast - - -def get_padding_id(tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast) -> int: - if hasattr(tokenizer, "pad_token_id") and tokenizer.pad_token_id is not None: +from huggingface_hub.utils import ( + HFValidationError, + RepositoryNotFoundError, + validate_repo_id, +) +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast + + +def get_padding_id( + tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast) -> int: + if hasattr(tokenizer, + "pad_token_id") and tokenizer.pad_token_id is not None: return tokenizer.pad_token_id - if hasattr(tokenizer, "pad_token_ids") and tokenizer.pad_token_ids is not None: + if hasattr(tokenizer, + "pad_token_ids") and tokenizer.pad_token_ids is not None: return tokenizer.pad_token_ids[0] - if hasattr(tokenizer, "mask_token_id") and tokenizer.mask_token_id is not None: + if hasattr(tokenizer, + "mask_token_id") and tokenizer.mask_token_id is not None: return tokenizer.mask_token_id - if hasattr(tokenizer, "mask_token_ids") and tokenizer.mask_token_ids is not None: + if hasattr(tokenizer, + "mask_token_ids") and tokenizer.mask_token_ids is not None: return tokenizer.mask_token_ids[0] - if hasattr(tokenizer, "pad_token_type_id") and tokenizer.pad_token_type_id is not None: + if (hasattr(tokenizer, "pad_token_type_id") + and tokenizer.pad_token_type_id is not None): return tokenizer.pad_token_type_id if hasattr(tokenizer, "_pad_token") and tokenizer.pad_token is not None: return int(tokenizer.pad_token) @@ -25,8 +35,7 @@ def get_padding_id(tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast) -> "Could not find padding id in a tokenizer with the following attributes: " f"{tokenizer.__dict__.keys()}" "Please make sure that the tokenizer has the following attributes: " - "pad_token_id, pad_token_ids, mask_token_id, mask_token_ids" - ) + "pad_token_id, pad_token_ids, mask_token_id, mask_token_ids") def get_model_config(model_name: str) -> Optional[dict]: @@ -34,7 +43,8 @@ def get_model_config(model_name: str) -> Optional[dict]: base_url = f"https://huggingface.co/{model_name}/raw/main/config.json" try: config_json_string: str = requests.get(base_url).text - except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout): + except (requests.exceptions.ConnectionError, + requests.exceptions.ReadTimeout): return None return json.loads(config_json_string) @@ -46,7 +56,8 @@ def get_tokenizer_config(model_name: str) -> Optional[dict]: base_url = 
f"https://huggingface.co/{model_name}/raw/main/tokenizer_config.json" try: config_json_string: str = requests.get(base_url).text - except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout): + except (requests.exceptions.ConnectionError, + requests.exceptions.ReadTimeout): return None return json.loads(config_json_string) @@ -56,13 +67,13 @@ def is_valid_model_name(model_name: str) -> bool: if "output" in model_name: return False if model_name in { - ".", - "model_medium_300_hf", - "DialoGPT-small-house", - "emo-bot", - "sberbank-ai/rugpt3_medium_gpt2_based", - "Grossmend/rudialogpt3_medium_based_on_gpt2", - "workflow" + ".", + "model_medium_300_hf", + "DialoGPT-small-house", + "emo-bot", + "sberbank-ai/rugpt3_medium_gpt2_based", + "Grossmend/rudialogpt3_medium_based_on_gpt2", + "workflow", }: return False try: @@ -82,7 +93,8 @@ def get_model_name_from_repo(repo_id: str) -> str: repo_id_no_org = repo_id.split("/")[-1] if model_config is not None and "_name_or_path" in model_config.keys(): config_model_name = model_config["_name_or_path"] - if is_valid_model_name(config_model_name) and config_model_name != repo_id_no_org: + if (is_valid_model_name(config_model_name) + and config_model_name != repo_id_no_org): return config_model_name return repo_id @@ -93,32 +105,33 @@ def get_tokenizer_name_from_repo(repo_id: str) -> str: except Exception: return repo_id repo_id_no_org = repo_id.split("/")[-1] - if tokenizer_config is not None and "name_or_path" in tokenizer_config.keys(): + if tokenizer_config is not None and "name_or_path" in tokenizer_config.keys( + ): config_model_name = tokenizer_config["name_or_path"] - if is_valid_model_name(config_model_name) and config_model_name != repo_id_no_org: + if (is_valid_model_name(config_model_name) + and config_model_name != repo_id_no_org): return config_model_name return repo_id @lru_cache(maxsize=1) def get_special_cases() -> Dict[str, str]: - special_cases_file = os.path.join(os.path.dirname(__file__), "model_to_tokenizer.json") + special_cases_file = os.path.join(os.path.dirname(__file__), + "model_to_tokenizer.json") with open(special_cases_file) as f: return json.loads(f.read()) # noinspection PyProtectedMember -def get_tokenizer_name( - model_name: str, -) -> str: +def get_tokenizer_name(model_name: str, ) -> str: """Returns a tokenizer name based on the model name""" special_cases = get_special_cases() if model_name in special_cases.keys(): return special_cases[model_name] - if model_name.startswith("Aleksandar1932/gpt2") \ - or model_name.startswith("Azaghast/GPT2") \ - or model_name.startswith("SteveC/sdc_bot") \ - or model_name.startswith("benjamin/gpt2-wechsel-"): + if (model_name.startswith("Aleksandar1932/gpt2") + or model_name.startswith("Azaghast/GPT2") + or model_name.startswith("SteveC/sdc_bot") + or model_name.startswith("benjamin/gpt2-wechsel-")): return "gpt2" tokenizer_name_from_repo = get_tokenizer_name_from_repo(model_name) if tokenizer_name_from_repo != model_name: @@ -130,8 +143,7 @@ def get_tokenizer_name( def get_tokenizer( - model_name: str, -) -> PreTrainedTokenizer | PreTrainedTokenizerFast: + model_name: str, ) -> PreTrainedTokenizer | PreTrainedTokenizerFast: """ Returns a tokenizer based on the model name Args: @@ -153,8 +165,9 @@ def get_tokenizer( raise RepositoryNotFoundError( f"Model {model_name} not found in huggingfacehub. 
Local tokenizers are not supported yet.\n" f"If {model_name} is a tokenizers model, please make sure you are logged in.\n" - f"If {model_name} is a tokenizers model, please make sure it exists.\n" + str(error) - ) - if not hasattr(raw_tokenizer, "pad_token_id") or raw_tokenizer.pad_token_id is None: + f"If {model_name} is a tokenizers model, please make sure it exists.\n" + + str(error)) + if not hasattr(raw_tokenizer, + "pad_token_id") or raw_tokenizer.pad_token_id is None: raw_tokenizer.pad_token_id = get_padding_id(raw_tokenizer) return raw_tokenizer diff --git a/tests/test_batch_end_to_end_pipeline.py b/tests/test_batch_end_to_end_pipeline.py index 82e021d..7551ad7 100644 --- a/tests/test_batch_end_to_end_pipeline.py +++ b/tests/test_batch_end_to_end_pipeline.py @@ -1,19 +1,26 @@ import os from typing import List -# Generated by CodiumAI - import pytest import torch from huggingface_hub.utils import RepositoryNotFoundError -from torch import inference_mode, Tensor, float32, long +from torch import Tensor, float32, inference_mode, long + # noinspection PyProtectedMember from torch._dynamo import OptimizedModule -from transformers import AutoConfig, PreTrainedTokenizer, PreTrainedTokenizerFast, GenerationConfig - -from src.grouped_sampling.batch_end_to_end_pipeline import BatchEndToEndSingleSequencePipeLine +from transformers import ( + AutoConfig, + GenerationConfig, + PreTrainedTokenizer, + PreTrainedTokenizerFast, +) + +from src.grouped_sampling.batch_end_to_end_pipeline import ( + BatchEndToEndSingleSequencePipeLine, +) from src.grouped_sampling.logits_vec_to_token import LogitVectorToTokenPipeLine +# Generated by CodiumAI """ Code Analysis @@ -36,7 +43,11 @@ """ -def validate_logits(pipeleine: BatchEndToEndSingleSequencePipeLine, logits: List[Tensor], output_length: int) -> None: +def validate_logits( + pipeleine: BatchEndToEndSingleSequencePipeLine, + logits: List[Tensor], + output_length: int, +) -> None: if not isinstance(logits, list): raise TypeError(f"logits should be a list, got {type(logits)}") if len(logits) == 0: @@ -44,58 +55,88 @@ def validate_logits(pipeleine: BatchEndToEndSingleSequencePipeLine, logits: List if not all(isinstance(logit, Tensor) for logit in logits): raise TypeError("logits should be a list of tensors") if not all(logit.device == pipeleine.device for logit in logits): - raise ValueError(f"logits should be on device {pipeleine.device}, got {logits[0].device} for some logit") + raise ValueError( + f"logits should be on device {pipeleine.device}, got {logits[0].device} for some logit" + ) if not all(logit.dtype == float32 for logit in logits): - raise ValueError(f"logits should have dtype {float32}, got {logits[0].dtype} for some logit") + raise ValueError( + f"logits should have dtype {float32}, got {logits[0].dtype} for some logit" + ) if not all(logit.dim() == 2 for logit in logits): - raise ValueError(f"logits should be 2D tensors, got {logits[0].dim()}D tensor for some logit") + raise ValueError( + f"logits should be 2D tensors, got {logits[0].dim()}D tensor for some logit" + ) if not all(logit.shape[0] <= output_length for logit in logits): raise ValueError(f"logits should have at most {output_length} rows") - if not all(logit.shape[1] == pipeleine.tokenizer.vocab_size for logit in logits): - raise ValueError(f"logits should have {pipeleine.tokenizer.vocab_size} columns") + if not all(logit.shape[1] == pipeleine.tokenizer.vocab_size + for logit in logits): + raise ValueError( + f"logits should have {pipeleine.tokenizer.vocab_size} columns") -def 
validate_padded_tokens(pipeline: BatchEndToEndSingleSequencePipeLine, padded_tokens: Tensor) -> None: +def validate_padded_tokens(pipeline: BatchEndToEndSingleSequencePipeLine, + padded_tokens: Tensor) -> None: if padded_tokens.dim() != 2: - raise ValueError(f"tokens should be a 2D tensor, got {padded_tokens.dim()}D tensor") + raise ValueError( + f"tokens should be a 2D tensor, got {padded_tokens.dim()}D tensor") if padded_tokens.requires_grad: raise ValueError("tokens should not require grad") if padded_tokens.shape[1] > pipeline.max_total_len: - raise ValueError(f"tokens should have at most {pipeline.max_total_len} columns, got {padded_tokens.shape[1]}") + raise ValueError( + f"tokens should have at most {pipeline.max_total_len} columns, got {padded_tokens.shape[1]}" + ) if min(padded_tokens.shape) == 0: raise ValueError("tokens should not be empty") if padded_tokens.dtype != long: - raise ValueError(f"tokens should have dtype {long}, got {padded_tokens.dtype}") - if not all(0 <= token < pipeline.tokenizer.vocab_size for token in padded_tokens.flatten()): + raise ValueError( + f"tokens should have dtype {long}, got {padded_tokens.dtype}") + if not all(0 <= token < pipeline.tokenizer.vocab_size + for token in padded_tokens.flatten()): raise ValueError("tokens should be valid token ids") if padded_tokens.device != pipeline.device: - raise ValueError(f"tokens should be on device {pipeline.device}, got {padded_tokens.device}") - if any(padded_tokens[i, -1] == padded_tokens[i, 0] for i in range(padded_tokens.shape[0])): + raise ValueError( + f"tokens should be on device {pipeline.device}, got {padded_tokens.device}" + ) + if any(padded_tokens[i, -1] == padded_tokens[i, 0] + for i in range(padded_tokens.shape[0])): raise ValueError("The first token can never be the padding token") -def validate_output_tokens(pipeline: BatchEndToEndSingleSequencePipeLine, output_tokens: List[Tensor], - output_length: int) -> None: +def validate_output_tokens( + pipeline: BatchEndToEndSingleSequencePipeLine, + output_tokens: List[Tensor], + output_length: int, +) -> None: if not isinstance(output_tokens, list): - raise TypeError(f"output_tokens should be a list, got {type(output_tokens)}") + raise TypeError( + f"output_tokens should be a list, got {type(output_tokens)}") if len(output_tokens) == 0: raise ValueError("output_tokens should not be empty") if not all(isinstance(tokens, Tensor) for tokens in output_tokens): raise TypeError("output_tokens should be a list of tensors") if not all(tokens.device == pipeline.device for tokens in output_tokens): raise ValueError( - f"output_tokens should be on device {pipeline.device}, got {output_tokens[0].device} for some tokens") + f"output_tokens should be on device {pipeline.device}, got {output_tokens[0].device} for some tokens" + ) if not all(tokens.dtype == long for tokens in output_tokens): - raise ValueError(f"output_tokens should have dtype {long}, got {output_tokens[0].dtype} for some tokens") + raise ValueError( + f"output_tokens should have dtype {long}, got {output_tokens[0].dtype} for some tokens" + ) if not all(tokens.dim() == 1 for tokens in output_tokens): - raise ValueError(f"output_tokens should be 1D tensors, got {output_tokens[0].dim()}D tensor for some tokens") - if not all(0 <= token < pipeline.tokenizer.vocab_size for sequence in output_tokens for token in sequence): + raise ValueError( + f"output_tokens should be 1D tensors, got {output_tokens[0].dim()}D tensor for some tokens" + ) + if not all(0 <= token < pipeline.tokenizer.vocab_size + for 
sequence in output_tokens for token in sequence): raise ValueError("output_tokens should be valid token ids") - if not all(sequence.shape[0] <= output_length for sequence in output_tokens): - raise ValueError("output_tokens should have at most output_length tokens") + if not all(sequence.shape[0] <= output_length + for sequence in output_tokens): + raise ValueError( + "output_tokens should have at most output_length tokens") class TestBatchEndToEndSingleSequencePipeLine: + @staticmethod def setup_method(): os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -107,82 +148,90 @@ def setup_method(): # noinspection PyUnresolvedReferences torch._dynamo.config.verbose = True import bitsandbytes - assert bitsandbytes.COMPILED_WITH_CUDA, "bitsandbytes was not compiled with CUDA" + + assert (bitsandbytes.COMPILED_WITH_CUDA + ), "bitsandbytes was not compiled with CUDA" # Tests that the function returns a list of output strings for a batch of prompts with positive output length def test_happy_path(self): - pipeline = BatchEndToEndSingleSequencePipeLine('gpt2') - prompts = ['Hello', 'How are you?'] + pipeline = BatchEndToEndSingleSequencePipeLine("gpt2") + prompts = ["Hello", "How are you?"] output_length = 5 result = pipeline.genearte_batch(prompts, output_length) assert isinstance(result, list) assert len(result) == len(prompts) for output in result: assert isinstance(output, str), f"{output} is not a string" - assert len(output) >= output_length, f"{len(output)} > {output_length}" + assert len( + output) >= output_length, f"{len(output)} > {output_length}" # Each token is at least 1 character long output_tokens = pipeline.tokenizer.encode(output) rebuilt_output = pipeline.tokenizer.decode(output_tokens) assert rebuilt_output == output, f"{rebuilt_output} != {output}" - assert len(output_tokens) <= output_length, f"{len(output_tokens)} != {output_length}" + assert ( + len(output_tokens) + <= output_length), f"{len(output_tokens)} != {output_length}" # Tests that the function returns an empty list for an empty batch def test_empty_prompts(self): - pipeline = BatchEndToEndSingleSequencePipeLine('gpt2') + pipeline = BatchEndToEndSingleSequencePipeLine("gpt2") prompts = [] output_length = 5 expected_output = [] - assert pipeline.genearte_batch(prompts, output_length) == expected_output + assert pipeline.genearte_batch(prompts, + output_length) == expected_output # Tests that the function returns a list of empty strings for a batch of prompts with output length 0 def test_empty_output_length(self): - pipeline = BatchEndToEndSingleSequencePipeLine('gpt2') - prompts = ['Hello', 'How are you?'] + pipeline = BatchEndToEndSingleSequencePipeLine("gpt2") + prompts = ["Hello", "How are you?"] output_length = 0 - expected_output = ['', ''] - assert pipeline.genearte_batch(prompts, output_length) == expected_output + expected_output = ["", ""] + assert pipeline.genearte_batch(prompts, + output_length) == expected_output # Tests that the function returns a list of empty strings for an empty list of prompts def test_empty_prompts_list(self): - pipeline = BatchEndToEndSingleSequencePipeLine('gpt2') - prompts = [''] + pipeline = BatchEndToEndSingleSequencePipeLine("gpt2") + prompts = [""] output_length = 5 with pytest.raises(ValueError): pipeline.genearte_batch(prompts, output_length) # Tests that the function raises a ValueError if the prompts contain an empty string def test_empty_string_in_prompts(self): - pipeline = BatchEndToEndSingleSequencePipeLine('gpt2') - prompts = ['Hello', ''] + pipeline = 
BatchEndToEndSingleSequencePipeLine("gpt2") + prompts = ["Hello", ""] output_length = 5 with pytest.raises(ValueError): pipeline.genearte_batch(prompts, output_length) # Tests that the function raises a ValueError if output_length is negative def test_negative_output_length(self): - pipeline = BatchEndToEndSingleSequencePipeLine('gpt2') - prompts = ['Hello', 'How are you?'] + pipeline = BatchEndToEndSingleSequencePipeLine("gpt2") + prompts = ["Hello", "How are you?"] output_length = -1 with pytest.raises(ValueError): pipeline.genearte_batch(prompts, output_length) # Tests that the function raises a ValueError if output_length is not an integer def test_non_integer_output_length(self): - pipeline = BatchEndToEndSingleSequencePipeLine('gpt2') - prompts = ['Hello', 'How are you?'] + pipeline = BatchEndToEndSingleSequencePipeLine("gpt2") + prompts = ["Hello", "How are you?"] output_length = 1.5 with pytest.raises(TypeError): # noinspection PyTypeChecker pipeline.genearte_batch(prompts, output_length) @inference_mode def test_step_by_step_pipeline(self): - pipeline = BatchEndToEndSingleSequencePipeLine('gpt2') - prompts = ['Hello', 'How are you?'] + pipeline = BatchEndToEndSingleSequencePipeLine("gpt2") + prompts = ["Hello", "How are you?"] output_length = 5 padded_tokens = pipeline.tokenize_and_pad(prompts, output_length) validate_padded_tokens(pipeline, padded_tokens) assert not padded_tokens.requires_grad - logits = pipeline.tokens_batch_to_logit_matrices(padded_tokens, output_length) + logits = pipeline.tokens_batch_to_logit_matrices( + padded_tokens, output_length) validate_logits(pipeline, logits, output_length) output_tokens = pipeline.logit_to_token_pipeline.batch_to_tokens( input_ids=padded_tokens, @@ -192,43 +241,49 @@ def test_step_by_step_pipeline(self): # Tests that the function raises a ValueError if output_length is too large def test_huge_output_length(self): - pipeline = BatchEndToEndSingleSequencePipeLine('gpt2') - prompts = ['Hello', 'How are you?'] + pipeline = BatchEndToEndSingleSequencePipeLine("gpt2") + prompts = ["Hello", "How are you?"] output_length = 1000000 with pytest.raises(ValueError): pipeline.genearte_batch(prompts, output_length) # test that genearte_batch works correctrly when it gets a string as input def test_string_input(self): - pipeline = BatchEndToEndSingleSequencePipeLine('gpt2') - prompt = 'Hello' + pipeline = BatchEndToEndSingleSequencePipeLine("gpt2") + prompt = "Hello" output_length = 5 result = pipeline.genearte_batch(prompt, output_length) assert isinstance(result, list) assert len(result) == 1 for output in result: assert isinstance(output, str), f"{output} is not a string" - assert len(output) >= output_length, f"{len(output)} > {output_length}" + assert len( + output) >= output_length, f"{len(output)} > {output_length}" # Each token is at least 1 character long output_tokens = pipeline.tokenizer.encode(output) rebuilt_output = pipeline.tokenizer.decode(output_tokens) assert rebuilt_output == output, f"{rebuilt_output} != {output}" - assert len(output_tokens) <= output_length, f"{len(output_tokens)} != {output_length}" + assert ( + len(output_tokens) + <= output_length), f"{len(output_tokens)} != {output_length}" def test_init(self): - pipeline = BatchEndToEndSingleSequencePipeLine('gpt2') + pipeline = BatchEndToEndSingleSequencePipeLine("gpt2") self.validate_pipeline(pipeline) def test_init_non_existing_model(self): with pytest.raises(RepositoryNotFoundError): - BatchEndToEndSingleSequencePipeLine('non_existing_model') + 
BatchEndToEndSingleSequencePipeLine("non_existing_model") @inference_mode() def test_init_8bits_model(self): import bitsandbytes - assert bitsandbytes.COMPILED_WITH_CUDA, "bitsandbytes was not compiled with CUDA" - pipeline = BatchEndToEndSingleSequencePipeLine('fxmarty/tiny-llama-fast-tokenizer') - prompts = ['Hello', 'How are you?'] + + assert (bitsandbytes.COMPILED_WITH_CUDA + ), "bitsandbytes was not compiled with CUDA" + pipeline = BatchEndToEndSingleSequencePipeLine( + "fxmarty/tiny-llama-fast-tokenizer") + prompts = ["Hello", "How are you?"] padded_tokens = pipeline.tokenize_and_pad(prompts, 5) validate_padded_tokens(pipeline, padded_tokens) logits = pipeline.tokens_batch_to_logit_matrices(padded_tokens, 5) @@ -241,38 +296,48 @@ def test_init_8bits_model(self): # self.validate_pipeline(pipeline) def test_init_model_kwargs(self): - config = AutoConfig.from_pretrained('gpt2') + config = AutoConfig.from_pretrained("gpt2") config.output_hidden_states = True - pipeline = BatchEndToEndSingleSequencePipeLine('gpt2', model_kwargs={'config': config}) + pipeline = BatchEndToEndSingleSequencePipeLine( + "gpt2", model_kwargs={"config": config}) self.validate_pipeline(pipeline) def test_init_generation_config(self): - config = GenerationConfig.from_pretrained('gpt2') + config = GenerationConfig.from_pretrained("gpt2") config.top_k = 10 config.top_p = 0.9 - pipeline = BatchEndToEndSingleSequencePipeLine('gpt2', generation_config=config) + pipeline = BatchEndToEndSingleSequencePipeLine( + "gpt2", generation_config=config) self.validate_pipeline(pipeline) @staticmethod @inference_mode() def validate_pipeline(pipeline): - assert pipeline.tokenizer is not None, 'tokenizer is None' - assert isinstance(pipeline.tokenizer, PreTrainedTokenizer) or isinstance(pipeline.tokenizer, - PreTrainedTokenizerFast), \ - 'tokenizer is not PreTrainedTokenizer or PreTrainedTokenizerFast' - assert pipeline.model is not None, 'model is None' - assert isinstance(pipeline.model, OptimizedModule), 'model is not OptimizedModule' - assert pipeline.logit_to_token_pipeline is not None, 'logit_to_token_pipeline is None' - assert isinstance(pipeline.logit_to_token_pipeline, LogitVectorToTokenPipeLine), \ - 'logit_to_token_pipeline is not LogitVectorToTokenPipeLine' - assert pipeline.max_total_len is not None, 'max_total_len is None' - assert isinstance(pipeline.max_total_len, int), 'max_total_len is not int' - assert pipeline.max_total_len > 0, 'max_total_len <= 0' - assert pipeline.device is not None, 'device is None' - assert isinstance(pipeline.device, torch.device), 'device is not torch.device' + assert pipeline.tokenizer is not None, "tokenizer is None" + assert isinstance( + pipeline.tokenizer, PreTrainedTokenizer + ) or isinstance( + pipeline.tokenizer, PreTrainedTokenizerFast + ), "tokenizer is not PreTrainedTokenizer or PreTrainedTokenizerFast" + assert pipeline.model is not None, "model is None" + assert isinstance(pipeline.model, + OptimizedModule), "model is not OptimizedModule" + assert (pipeline.logit_to_token_pipeline + is not None), "logit_to_token_pipeline is None" + assert isinstance( + pipeline.logit_to_token_pipeline, LogitVectorToTokenPipeLine + ), "logit_to_token_pipeline is not LogitVectorToTokenPipeLine" + assert pipeline.max_total_len is not None, "max_total_len is None" + assert isinstance(pipeline.max_total_len, + int), "max_total_len is not int" + assert pipeline.max_total_len > 0, "max_total_len <= 0" + assert pipeline.device is not None, "device is None" + assert isinstance(pipeline.device, + 
torch.device), "device is not torch.device" # assert that the device is cuda - assert pipeline.device.type == 'cuda', f"device is not cuda: {pipeline.device.type}" - prompt = 'Hello' + assert (pipeline.device.type == "cuda" + ), f"device is not cuda: {pipeline.device.type}" + prompt = "Hello" pipeline.genearte_batch(prompt, 5) # noinspection PyTypeChecker @@ -280,6 +345,6 @@ def test_init_wrong_types(self): with pytest.raises(TypeError): BatchEndToEndSingleSequencePipeLine(1) with pytest.raises(TypeError): - BatchEndToEndSingleSequencePipeLine('gpt2', model_kwargs=1) + BatchEndToEndSingleSequencePipeLine("gpt2", model_kwargs=1) with pytest.raises(TypeError): - BatchEndToEndSingleSequencePipeLine('gpt2', generation_config=1) + BatchEndToEndSingleSequencePipeLine("gpt2", generation_config=1) diff --git a/tests/test_get_model.py b/tests/test_get_model.py index 8afa733..c33d645 100644 --- a/tests/test_get_model.py +++ b/tests/test_get_model.py @@ -7,7 +7,8 @@ import pytest import torch from huggingface_hub.utils import RepositoryNotFoundError -from torch import inference_mode, cuda +from torch import cuda, inference_mode + # noinspection PyProtectedMember from torch._dynamo import OptimizedModule from torch.nn import Module @@ -46,46 +47,62 @@ class TestGetModel: + @staticmethod def validate_model(model): - assert isinstance(model, OptimizedModule), "The model is not an instance of OptimizedModule" - assert isinstance(model, Module), "The model is not an instance of torch.nn.Module" - assert hasattr(model, "config"), "The model does not have a config attribute" - assert isinstance(model.config, PretrainedConfig), "The model config is not an instance of PretrainedConfig" - assert hasattr(model, "device"), "The model does not have a device attribute" - assert isinstance(model.device, torch.device), "The model device is not an instance of torch.device" + assert isinstance( + model, + OptimizedModule), "The model is not an instance of OptimizedModule" + assert isinstance( + model, Module), "The model is not an instance of torch.nn.Module" + assert hasattr(model, + "config"), "The model does not have a config attribute" + assert isinstance( + model.config, PretrainedConfig + ), "The model config is not an instance of PretrainedConfig" + assert hasattr(model, + "device"), "The model does not have a device attribute" + assert isinstance( + model.device, torch.device + ), "The model device is not an instance of torch.device" if cuda.is_available(): - assert model.device.type == "cuda", "CUDA is avilable and the model device is not utilizing it" + assert ( + model.device.type == "cuda" + ), "CUDA is avilable and the model device is not utilizing it" else: - assert model.device.type == "cpu", "CUDA is not avilable and the model device is not on the CPU" + assert ( + model.device.type == "cpu" + ), "CUDA is not avilable and the model device is not on the CPU" assert isinstance(model, Callable), "The model is not callable" # Tests that the function returns a PreTrainedModel object def test_returns_model_object(self): - model = get_model('gpt2') + model = get_model("gpt2") self.validate_model(model) # Tests that the function loads the model with the correct kwargs def test_loads_correct_kwargs(self): - model_name = 'gpt2' - kwargs = {'output_hidden_states': True} + model_name = "gpt2" + kwargs = {"output_hidden_states": True} model = get_model(model_name, **kwargs) - assert model.config.output_hidden_states == kwargs['output_hidden_states'] + assert model.config.output_hidden_states == kwargs[ + 
"output_hidden_states"] self.validate_model(model) # Tests that the function loads the model in 8bit correctly def test_loads_model_in_8bit(self): - model_name = 'gpt2' + model_name = "gpt2" model = get_model(model_name, load_in_8bit=True) self.validate_model(model) @inference_mode() # Tests that the model gives the same output when getting the same input def test_model_gives_same_output(self): - model_name = 'gpt2' + model_name = "gpt2" model = get_model(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name) - input_ids = tokenizer('Hello, my dog is cute', return_tensors='pt')["input_ids"] + input_ids = tokenizer("Hello, my dog is cute", + return_tensors="pt")["input_ids"] input_ids = input_ids.cuda() output1 = model(input_ids) logits1 = output1.logits @@ -99,27 +116,33 @@ def test_model_gives_same_output_with_and_without_batching(self): torch.manual_seed(0) random.seed(0) np.random.seed(0) - os.environ['TOKENIZERS_PARALLELISM'] = 'false' - model_name = 'fxmarty/tiny-llama-fast-tokenizer' + os.environ["TOKENIZERS_PARALLELISM"] = "false" + model_name = "fxmarty/tiny-llama-fast-tokenizer" model = get_model(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name) - prompts = ['Hello, my dog is cute'] * 2 - input_ids_batch = tokenizer(prompts, return_tensors='pt')["input_ids"].cuda() + prompts = ["Hello, my dog is cute"] * 2 + input_ids_batch = tokenizer(prompts, + return_tensors="pt")["input_ids"].cuda() with torch.no_grad(): output_batch = model(input_ids_batch) logits_batch = output_batch.logits - input_ids = tokenizer(prompts[0], return_tensors='pt')["input_ids"].cuda() + input_ids = tokenizer(prompts[0], + return_tensors="pt")["input_ids"].cuda() with torch.no_grad(): output = model(input_ids) logits = output.logits assert torch.equal(logits_batch[0], logits_batch[1]) - assert logits_batch.shape == (2, input_ids.shape[1], model.config.vocab_size) + assert logits_batch.shape == (2, input_ids.shape[1], + model.config.vocab_size) assert logits.shape == (1, input_ids.shape[1], model.config.vocab_size) assert logits.dtype == logits_batch.dtype for i in range(input_ids.shape[1]): - assert torch.isclose(logits_batch[0][i], logits[0][i], atol=1e-5, rtol=1e-4).all(), \ - f'Index {i} does not match. got {logits_batch[0][i]} and {logits[0][i]}' + assert torch.isclose( + logits_batch[0][i], logits[0][i], atol=1e-5, rtol=1e-4 + ).all(), ( + f"Index {i} does not match. got {logits_batch[0][i]} and {logits[0][i]}" + ) def test_nonexistent_model(self): with pytest.raises(RepositoryNotFoundError): - get_model('nonexistent-model') + get_model("nonexistent-model") diff --git a/tests/test_get_padding_id.py b/tests/test_get_padding_id.py index ba3efaf..634b93d 100644 --- a/tests/test_get_padding_id.py +++ b/tests/test_get_padding_id.py @@ -3,7 +3,7 @@ import pytest from transformers import PreTrainedTokenizer -from src.grouped_sampling.tokenizer import get_tokenizer, get_padding_id +from src.grouped_sampling.tokenizer import get_padding_id, get_tokenizer def create_tokenizers() -> Generator[PreTrainedTokenizer, None, None]: diff --git a/tests/test_get_tokenizer.py b/tests/test_get_tokenizer.py index 7cecba1..114b8bb 100644 --- a/tests/test_get_tokenizer.py +++ b/tests/test_get_tokenizer.py @@ -1,4 +1,3 @@ - # Generated by CodiumAI import pytest @@ -32,18 +31,21 @@ - The function calls the 'get_padding_id' function to obtain the padding id for the tokenizer if it does not have a 'pad_token_id' attribute. 
- The function sets the 'pad_token_id' attribute of the tokenizer to the obtained padding id. """ + + class TestGetTokenizer: # Tests that a tokenizer is returned when given a valid model name def test_valid_model_name(self): - tokenizer = get_tokenizer('bert-base-uncased') - assert isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)) + tokenizer = get_tokenizer("bert-base-uncased") + assert isinstance(tokenizer, + (PreTrainedTokenizer, PreTrainedTokenizerFast)) # Tests that padding_side is set to 'right' when creating the tokenizer def test_padding_side(self): - tokenizer = get_tokenizer('bert-base-uncased') - assert tokenizer.padding_side == 'right' + tokenizer = get_tokenizer("bert-base-uncased") + assert tokenizer.padding_side == "right" # Tests that RepositoryNotFoundError is raised when model is not found in huggingfacehub or local cache def test_repo_not_found(self): with pytest.raises(RepositoryNotFoundError): - get_tokenizer('nonexistent-model') + get_tokenizer("nonexistent-model") diff --git a/tests/test_logit_to_token.py b/tests/test_logit_to_token.py index 2bf9ff8..9d3f594 100644 --- a/tests/test_logit_to_token.py +++ b/tests/test_logit_to_token.py @@ -2,7 +2,7 @@ import pytest import torch -from torch import LongTensor, FloatTensor, long, Tensor +from torch import FloatTensor, LongTensor, Tensor, long from transformers import GenerationConfig from src.grouped_sampling.logits_vec_to_token import LogitVectorToTokenPipeLine @@ -27,13 +27,15 @@ class TestLogitVectorToTokenPipeLine: exapmle_input_ids = LongTensor([1, 2, 3]).cuda() example_logits_vector = FloatTensor([0.1, 0.2, 0.7]).cuda() - example_logits_vectors = FloatTensor([[0.1, 0.2, 0.7], [0.3, 0.4, 0.3], [0.5, 0.1, 0.4]]).cuda() + example_logits_vectors = FloatTensor([[0.1, 0.2, 0.7], [0.3, 0.4, 0.3], + [0.5, 0.1, 0.4]]).cuda() # Tests that single_logit_vector_to_token returns a valid token id when given valid input_ids and logits def test_single_logit_vector_to_token_valid_input(self): generation_config = GenerationConfig() pipeline = LogitVectorToTokenPipeLine(generation_config) - result = pipeline.single_logit_vector_to_token(self.exapmle_input_ids, self.example_logits_vector) + result = pipeline.single_logit_vector_to_token( + self.exapmle_input_ids, self.example_logits_vector) assert result in self.exapmle_input_ids assert isinstance(result, torch.Tensor) assert result.shape == torch.Size([1]) @@ -43,8 +45,10 @@ def test_single_logit_vector_to_token_valid_input(self): def test_logit_vectors_to_token_valid_input(self): generation_config = GenerationConfig() pipeline = LogitVectorToTokenPipeLine(generation_config) - result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, self.example_logits_vectors) - assert result.shape == (3,), f"Expected shape (3,), got {result.shape}. Result: {result}" + result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, + self.example_logits_vectors) + assert result.shape == ( + 3, ), f"Expected shape (3,), got {result.shape}. 
Result: {result}" assert all([r in [0, 1, 2] for r in result]) # Tests that single_logit_vector_to_token raises an error when given empty input_ids @@ -53,7 +57,8 @@ def test_single_logit_vector_to_token_empty_input_ids(self): pipeline = LogitVectorToTokenPipeLine(generation_config) input_ids = LongTensor([]).cuda() with pytest.raises(ValueError): - pipeline.single_logit_vector_to_token(input_ids, self.example_logits_vector) + pipeline.single_logit_vector_to_token(input_ids, + self.example_logits_vector) # Tests that single_logit_vector_to_token raises an error when given empty logits def test_single_logit_vector_to_token_empty_logits(self): @@ -61,7 +66,8 @@ def test_single_logit_vector_to_token_empty_logits(self): pipeline = LogitVectorToTokenPipeLine(generation_config) logits = FloatTensor([]).cuda() with pytest.raises(ValueError): - pipeline.single_logit_vector_to_token(self.exapmle_input_ids, logits) + pipeline.single_logit_vector_to_token(self.exapmle_input_ids, + logits) # Tests that logit_vectors_to_token raises an error when given empty input_ids def test_logit_vectors_to_token_empty_input_ids(self): @@ -69,7 +75,8 @@ def test_logit_vectors_to_token_empty_input_ids(self): pipeline = LogitVectorToTokenPipeLine(generation_config) input_ids = LongTensor([]).cuda() with pytest.raises(ValueError): - pipeline.logit_matrix_to_tokens(input_ids, self.example_logits_vectors) + pipeline.logit_matrix_to_tokens(input_ids, + self.example_logits_vectors) # Tests that logit_vectors_to_token raises an error when given empty logit_vectors def test_logit_vectors_to_token_empty_logit_vectors(self): @@ -77,69 +84,99 @@ def test_logit_vectors_to_token_empty_logit_vectors(self): pipeline = LogitVectorToTokenPipeLine(generation_config) logit_vectors = FloatTensor([]).cuda() with pytest.raises(ValueError): - pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, logit_vectors) + pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, + logit_vectors) def test_logit_vectors_to_token_valid_input_do_sample(self): generation_config = GenerationConfig(do_sample=True) pipeline = LogitVectorToTokenPipeLine(generation_config) - result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, self.example_logits_vectors) - assert result.shape == (3,), f"Expected shape (3,), got {result.shape}. Result: {result}" + result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, + self.example_logits_vectors) + assert result.shape == ( + 3, ), f"Expected shape (3,), got {result.shape}. Result: {result}" assert all([r in [0, 1, 2] for r in result]) def test_logit_vectors_to_token_valid_input_do_sample_temperature(self): generation_config = GenerationConfig(do_sample=True, temperature=0.5) pipeline = LogitVectorToTokenPipeLine(generation_config) - result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, self.example_logits_vectors) - assert result.shape == (3,), f"Expected shape (3,), got {result.shape}. Result: {result}" + result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, + self.example_logits_vectors) + assert result.shape == ( + 3, ), f"Expected shape (3,), got {result.shape}. Result: {result}" assert all([r in [0, 1, 2] for r in result]) def test_logit_vectors_to_token_valid_input_do_sample_top_k(self): generation_config = GenerationConfig(do_sample=True, top_k=2) pipeline = LogitVectorToTokenPipeLine(generation_config) - result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, self.example_logits_vectors) - assert result.shape == (3,), f"Expected shape (3,), got {result.shape}. 
Result: {result}" + result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, + self.example_logits_vectors) + assert result.shape == ( + 3, ), f"Expected shape (3,), got {result.shape}. Result: {result}" assert all([r in [0, 1, 2] for r in result]) def test_logit_vectors_to_token_valid_input_do_sample_top_p(self): generation_config = GenerationConfig(do_sample=True, top_p=0.9) pipeline = LogitVectorToTokenPipeLine(generation_config) - result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, self.example_logits_vectors) - assert result.shape == (3,), f"Expected shape (3,), got {result.shape}. Result: {result}" + result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, + self.example_logits_vectors) + assert result.shape == ( + 3, ), f"Expected shape (3,), got {result.shape}. Result: {result}" assert all([r in [0, 1, 2] for r in result]) def test_logit_vectors_to_token_valid_input_do_sample_top_k_top_p(self): - generation_config = GenerationConfig(do_sample=True, top_k=2, top_p=0.9) + generation_config = GenerationConfig(do_sample=True, + top_k=2, + top_p=0.9) pipeline = LogitVectorToTokenPipeLine(generation_config) - result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, self.example_logits_vectors) - assert result.shape == (3,), f"Expected shape (3,), got {result.shape}. Result: {result}" + result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, + self.example_logits_vectors) + assert result.shape == ( + 3, ), f"Expected shape (3,), got {result.shape}. Result: {result}" assert all([r in [0, 1, 2] for r in result]) - def test_logit_vectors_to_token_valid_input_do_sample_top_k_top_p_temperature(self): - generation_config = GenerationConfig(do_sample=True, top_k=2, top_p=0.9, temperature=0.5) + def test_logit_vectors_to_token_valid_input_do_sample_top_k_top_p_temperature( + self): + generation_config = GenerationConfig(do_sample=True, + top_k=2, + top_p=0.9, + temperature=0.5) pipeline = LogitVectorToTokenPipeLine(generation_config) - result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, self.example_logits_vectors) - assert result.shape == (3,), f"Expected shape (3,), got {result.shape}. Result: {result}" + result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, + self.example_logits_vectors) + assert result.shape == ( + 3, ), f"Expected shape (3,), got {result.shape}. Result: {result}" assert all([r in [0, 1, 2] for r in result]) - def test_logit_vectors_to_token_valid_input_do_sample_top_k_top_p_temperature_repetition_penalty(self): - generation_config = GenerationConfig(do_sample=True, top_k=2, top_p=0.9, temperature=0.5, + def test_logit_vectors_to_token_valid_input_do_sample_top_k_top_p_temperature_repetition_penalty( + self, ): + generation_config = GenerationConfig(do_sample=True, + top_k=2, + top_p=0.9, + temperature=0.5, repetition_penalty=0.9) pipeline = LogitVectorToTokenPipeLine(generation_config) - result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, self.example_logits_vectors) - assert result.shape == (3,), f"Expected shape (3,), got {result.shape}. Result: {result}" + result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, + self.example_logits_vectors) + assert result.shape == ( + 3, ), f"Expected shape (3,), got {result.shape}. 
Result: {result}" assert all([r in [0, 1, 2] for r in result]) def test_logit_vectors_to_token_valid_input_do_sample_false(self): generation_config = GenerationConfig(do_sample=False) pipeline = LogitVectorToTokenPipeLine(generation_config) - result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, self.example_logits_vectors) - assert result.shape == (3,), f"Expected shape (3,), got {result.shape}. Result: {result}" + result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, + self.example_logits_vectors) + assert result.shape == ( + 3, ), f"Expected shape (3,), got {result.shape}. Result: {result}" - def test_logit_vectors_to_token_valid_input_do_sample_false_max_length(self): + def test_logit_vectors_to_token_valid_input_do_sample_false_max_length( + self): generation_config = GenerationConfig(do_sample=False, max_length=3) pipeline = LogitVectorToTokenPipeLine(generation_config) - result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, self.example_logits_vectors) - assert result.shape == (3,), f"Expected shape (3,), got {result.shape}. Result: {result}" + result = pipeline.logit_matrix_to_tokens(self.exapmle_input_ids, + self.example_logits_vectors) + assert result.shape == ( + 3, ), f"Expected shape (3,), got {result.shape}. Result: {result}" # Tests that logit_matrix_to_tokens returns valid token ids with valid input_ids and logit_vectors def test_logit_matrix_to_tokens_valid_input(self): @@ -148,24 +185,25 @@ def test_logit_matrix_to_tokens_valid_input(self): pipeline = LogitVectorToTokenPipeLine(GenerationConfig()) tokens = pipeline.logit_matrix_to_tokens(input_ids, logit_vectors) assert isinstance(tokens, Tensor) - assert tokens.shape == (2,) + assert tokens.shape == (2, ) assert tokens[0].dtype == long assert tokens[1].dtype == long # Tests that batch_to_tokens returns valid token ids with valid input_ids and batch def test_batch_to_tokens_valid_input(self): input_ids = torch.tensor([[1, 2, 3], [4, 5, 6]]) - batch = [torch.tensor([[0.1, 0.2, 0.7], [0.3, 0.4, 0.3], [0.5, 0.2, 0.3]]), - torch.tensor([[0.1, 0.2, 0.7], [0.3, 0.4, 0.3], [0.5, 0.2, 0.3]])] + batch = [ + torch.tensor([[0.1, 0.2, 0.7], [0.3, 0.4, 0.3], [0.5, 0.2, 0.3]]), + torch.tensor([[0.1, 0.2, 0.7], [0.3, 0.4, 0.3], [0.5, 0.2, 0.3]]), + ] pipeline = LogitVectorToTokenPipeLine(GenerationConfig()) token_ids = pipeline.batch_to_tokens(input_ids, batch) assert isinstance(token_ids, list) assert len(token_ids) == 2 assert all(isinstance(tokens, Tensor) for tokens in token_ids) - assert all(token_ids[i].shape == (3,) for i in range(2)) - assert all(token_id.dtype == long for tokens in token_ids for token_id in tokens) - - + assert all(token_ids[i].shape == (3, ) for i in range(2)) + assert all(token_id.dtype == long for tokens in token_ids + for token_id in tokens) # Tests that logit_matrix_to_tokens raises a ValueError with empty logit_vectors def test_logit_matrix_to_tokens_empty_logit_vectors(self): @@ -193,8 +231,10 @@ def test_batch_to_tokens_invalid_input_ids(self): def test_batch_to_tokens_invalid_batch(self): input_ids = torch.tensor([[1, 2, 3], [4, 5, 6]]) - batch = [[[0.1, 0.2, 0.7], [0.3, 0.4, 0.3], [0.5, 0.2, 0.3]], - [[0.1, 0.2], [0.3, 0.4], [0.5, 0.2]]] + batch = [ + [[0.1, 0.2, 0.7], [0.3, 0.4, 0.3], [0.5, 0.2, 0.3]], + [[0.1, 0.2], [0.3, 0.4], [0.5, 0.2]], + ] pipeline = LogitVectorToTokenPipeLine(GenerationConfig()) with pytest.raises(ValueError): pipeline.batch_to_tokens(input_ids, batch)