Skip to content

Commit

Permalink
Implemented Suport Portuguese | Add Google Translation
Browse files Browse the repository at this point in the history
  • Loading branch information
joaorura committed Oct 29, 2024
1 parent e7987e5 commit 53d670c
Show file tree
Hide file tree
Showing 4 changed files with 270 additions and 54 deletions.
66 changes: 59 additions & 7 deletions src/ragas/metrics/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,24 @@

import asyncio
import logging
import re
import nltk

import typing as t
from abc import ABC, abstractmethod
from collections import Counter
from dataclasses import dataclass, field
from enum import Enum

from pysbd.cleaner import Cleaner
from pysbd.utils import TextSpan
from pysbd import Segmenter

from ragas.callbacks import ChainType, new_group
from ragas.dataset_schema import MultiTurnSample, SingleTurnSample
from ragas.executor import is_event_loop_running
from ragas.prompt import PromptMixin
from ragas.run_config import RunConfig
from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES, deprecated
from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES, RAGAS_SUPPORTED_LANGUAGE_CODES_PYSBD, deprecated

if t.TYPE_CHECKING:
from langchain_core.callbacks import Callbacks
Expand Down Expand Up @@ -452,16 +456,21 @@ def get_segmenter(
"""
Get a sentence segmenter for a given language
"""

language = language.lower()
if language not in RAGAS_SUPPORTED_LANGUAGE_CODES:
raise ValueError(
f"Language '{language}' not supported. Supported languages: {RAGAS_SUPPORTED_LANGUAGE_CODES.keys()}"
)
return Segmenter(
language=RAGAS_SUPPORTED_LANGUAGE_CODES[language],
clean=clean,
char_span=char_span,
)

if language in RAGAS_SUPPORTED_LANGUAGE_CODES_PYSBD:
return Segmenter(
language=RAGAS_SUPPORTED_LANGUAGE_CODES_PYSBD[language],
clean=clean,
char_span=char_span,
)
else:
return NLTKSegmenter(language=language, char_span=char_span)


def is_reproducable(metric: Metric) -> bool:
Expand All @@ -472,3 +481,46 @@ def is_reproducable(metric: Metric) -> bool:


ensembler = Ensember()


class NLTKSegmenter:
def __init__(self, language: str = "english", char_span: bool = False, clean: bool = False):
self.language = language.lower()
self.char_span = char_span
self.clean = clean

def sentences_with_char_spans(self, sentences):
sent_spans = []
prior_end_char_idx = 0
for sent in sentences:
for match in re.finditer('{0}\s*'.format(re.escape(sent)), self.original_text):
match_str = match.group()
match_start_idx, match_end_idx = match.span()
if match_end_idx > prior_end_char_idx:
sent_spans.append(
TextSpan(match_str, match_start_idx, match_end_idx))
prior_end_char_idx = match_end_idx
break
return sent_spans

def cleaner(self, text):
return Cleaner(text, self.language_module)

def segment(self, text):
self.original_text = text
if not text:
return []

if self.clean:
text = self.cleaner(text).clean()

postprocessed_sents = nltk.tokenize.sent_tokenize(text, language=self.language)
sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents)
if self.char_span:
return sentence_w_char_spans
elif self.clean:
# clean and destructed sentences
return postprocessed_sents
else:
# nondestructive with whitespaces
return [textspan.sent for textspan in sentence_w_char_spans]
5 changes: 3 additions & 2 deletions src/ragas/prompt/mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ def set_prompts(self, **prompts):
setattr(self, key, value)

async def adapt_prompts(
self, language: str, llm: BaseRagasLLM, adapt_instruction: bool = False
self, language: str, llm: BaseRagasLLM, adapt_instruction: bool = False,
google_translate: bool = False
) -> t.Dict[str, PydanticPrompt]:
"""
Adapts the prompts in the class to the given language and using the given LLM.
Expand All @@ -67,7 +68,7 @@ async def adapt_prompts(
prompts = self.get_prompts()
adapted_prompts = {}
for name, prompt in prompts.items():
adapted_prompt = await prompt.adapt(language, llm, adapt_instruction)
adapted_prompt = await prompt.adapt(language, llm, adapt_instruction, google_translate)
adapted_prompts[name] = adapted_prompt

return adapted_prompts
Expand Down
Loading

0 comments on commit 53d670c

Please sign in to comment.