From e63b164b7974eeca1841a60b00e2bc3b9c8d7ca7 Mon Sep 17 00:00:00 2001 From: Yifeng Lu Date: Sun, 8 Dec 2024 15:09:31 -0800 Subject: [PATCH] Add 'video/quicktime' to the supported video types. Also refactor and unify Gemini's supported modalities between VertexAI and GenAI APIs. PiperOrigin-RevId: 704075628 --- langfun/core/llms/google_genai.py | 70 ++++++++++++------------------- langfun/core/llms/vertexai.py | 19 +++++---- 2 files changed, 36 insertions(+), 53 deletions(-) diff --git a/langfun/core/llms/google_genai.py b/langfun/core/llms/google_genai.py index 2d24ec6..2ea0192 100644 --- a/langfun/core/llms/google_genai.py +++ b/langfun/core/llms/google_genai.py @@ -20,6 +20,7 @@ import langfun.core as lf from langfun.core import modalities as lf_modalities +from langfun.core.llms import vertexai import pyglove as pg @@ -307,71 +308,52 @@ def get( # -_IMAGE_TYPES = [ - 'image/png', - 'image/jpeg', - 'image/webp', - 'image/heic', - 'image/heif', -] - -_AUDIO_TYPES = [ - 'audio/aac', - 'audio/flac', - 'audio/mp3', - 'audio/m4a', - 'audio/mpeg', - 'audio/mpga', - 'audio/mp4', - 'audio/opus', - 'audio/pcm', - 'audio/wav', - 'audio/webm' -] - -_VIDEO_TYPES = [ - 'video/mov', - 'video/mpeg', - 'video/mpegps', - 'video/mpg', - 'video/mp4', - 'video/webm', - 'video/wmv', - 'video/x-flv', - 'video/3gpp', -] - -_PDF = [ - 'application/pdf', -] - - class GeminiExp_20241206(GenAI): # pylint: disable=invalid-name """Gemini Experimental model launched on 12/06/2024.""" model = 'gemini-exp-1206' - supported_modalities = _PDF + _IMAGE_TYPES + _AUDIO_TYPES + _VIDEO_TYPES + supported_modalities = ( + vertexai.DOCUMENT_TYPES + + vertexai.IMAGE_TYPES + + vertexai.AUDIO_TYPES + + vertexai.VIDEO_TYPES + ) class GeminiExp_20241114(GenAI): # pylint: disable=invalid-name """Gemini Experimental model launched on 11/14/2024.""" model = 'gemini-exp-1114' - supported_modalities = _PDF + _IMAGE_TYPES + _AUDIO_TYPES + _VIDEO_TYPES + supported_modalities = ( + vertexai.DOCUMENT_TYPES + 
vertexai.IMAGE_TYPES + + vertexai.AUDIO_TYPES + + vertexai.VIDEO_TYPES + ) class GeminiPro1_5(GenAI): # pylint: disable=invalid-name """Gemini Pro latest model.""" model = 'gemini-1.5-pro-latest' - supported_modalities = _PDF + _IMAGE_TYPES + _AUDIO_TYPES + _VIDEO_TYPES + supported_modalities = ( + vertexai.DOCUMENT_TYPES + + vertexai.IMAGE_TYPES + + vertexai.AUDIO_TYPES + + vertexai.VIDEO_TYPES + ) class GeminiFlash1_5(GenAI): # pylint: disable=invalid-name """Gemini Flash latest model.""" model = 'gemini-1.5-flash-latest' - supported_modalities = _PDF + _IMAGE_TYPES + _AUDIO_TYPES + _VIDEO_TYPES + supported_modalities = ( + vertexai.DOCUMENT_TYPES + + vertexai.IMAGE_TYPES + + vertexai.AUDIO_TYPES + + vertexai.VIDEO_TYPES + ) class GeminiPro(GenAI): @@ -384,7 +366,7 @@ class GeminiProVision(GenAI): """Gemini Pro vision model.""" model = 'gemini-pro-vision' - supported_modalities = _IMAGE_TYPES + _VIDEO_TYPES + supported_modalities = vertexai.IMAGE_TYPES + vertexai.VIDEO_TYPES class Palm2(GenAI): diff --git a/langfun/core/llms/vertexai.py b/langfun/core/llms/vertexai.py index 7344d6a..8b3ae28 100644 --- a/langfun/core/llms/vertexai.py +++ b/langfun/core/llms/vertexai.py @@ -343,7 +343,7 @@ def _message_from_content_parts( return lf.AIMessage.from_chunks(chunks) -_IMAGE_TYPES = [ +IMAGE_TYPES = [ 'image/png', 'image/jpeg', 'image/webp', @@ -351,7 +351,7 @@ def _message_from_content_parts( 'image/heif', ] -_AUDIO_TYPES = [ +AUDIO_TYPES = [ 'audio/aac', 'audio/flac', 'audio/mp3', @@ -362,10 +362,10 @@ def _message_from_content_parts( 'audio/opus', 'audio/pcm', 'audio/wav', - 'audio/webm' + 'audio/webm', ] -_VIDEO_TYPES = [ +VIDEO_TYPES = [ 'video/mov', 'video/mpeg', 'video/mpegps', @@ -375,9 +375,10 @@ def _message_from_content_parts( 'video/wmv', 'video/x-flv', 'video/3gpp', + 'video/quicktime', ] -_DOCUMENT_TYPES = [ +DOCUMENT_TYPES = [ 'application/pdf', 'text/plain', 'text/csv', @@ -391,8 +392,8 @@ def _message_from_content_parts( class 
VertexAIGemini1_5(VertexAI): # pylint: disable=invalid-name """Vertex AI Gemini 1.5 model.""" - supported_modalities: pg.typing.List(str).freeze( # pytype: disable=invalid-annotation - _DOCUMENT_TYPES + _IMAGE_TYPES + _AUDIO_TYPES + _VIDEO_TYPES + supported_modalities: pg.typing.List(str).freeze( # pytype: disable=invalid-annotation + DOCUMENT_TYPES + IMAGE_TYPES + AUDIO_TYPES + VIDEO_TYPES ) @@ -460,8 +461,8 @@ class VertexAIGeminiPro1Vision(VertexAI): # pylint: disable=invalid-name """Vertex AI Gemini 1.0 Pro Vision model.""" model = 'gemini-1.0-pro-vision' - supported_modalities: pg.typing.List(str).freeze( # pytype: disable=invalid-annotation - _IMAGE_TYPES + _VIDEO_TYPES + supported_modalities: pg.typing.List(str).freeze( # pytype: disable=invalid-annotation + IMAGE_TYPES + VIDEO_TYPES )