-
Notifications
You must be signed in to change notification settings - Fork 142
Fix Multimodal Gemini Batch Request Creation #690
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| import copy | ||
| import json | ||
| import mimetypes | ||
| import os | ||
| import typing as t | ||
| from functools import lru_cache | ||
|
|
@@ -228,8 +229,8 @@ def create_api_specific_request_batch(self, generic_request: GenericRequest) -> | |
| """Creates an API-specific request body from a generic request. | ||
|
|
||
| Transforms a GenericRequest into the format expected by Gemini's batch API. | ||
| Combines and constructs a system message with schema and instructions using | ||
| the instructor package for JSON response formatting. | ||
| Handles multi-modal inputs by parsing the dictionary structure created by the | ||
| curator framework, which combines text and images into a single message. | ||
|
|
||
| Args: | ||
| generic_request: The generic request object containing model, messages, | ||
|
|
@@ -239,17 +240,57 @@ def create_api_specific_request_batch(self, generic_request: GenericRequest) -> | |
| dict: API specific request body formatted for Gemini's batch API, | ||
| including custom_id and request parameters. | ||
| """ | ||
| contents = [] | ||
| parts = [] | ||
| # The prompt() function returns (text, image), which curator combines into | ||
| # a single message with a dictionary as its content. We need to parse this dict. | ||
| for message in generic_request.messages: | ||
| contents.append({"role": message["role"], "parts": [{"text": message["content"]}]}) | ||
| request_object = {"contents": contents} | ||
| content = message["content"] | ||
|
|
||
| # Primary Path: Handle the multimodal dictionary from curator | ||
| if isinstance(content, dict) and ("texts" in content or "images" in content): | ||
| # Add all text parts from the 'texts' list | ||
| for text_item in content.get("texts", []): | ||
| if isinstance(text_item, str): | ||
| parts.append({"text": text_item}) | ||
|
|
||
| # Add all image parts from the 'images' list | ||
| for image_item in content.get("images", []): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there are "files" as well |
||
| # image_item is a dict like {'url': '...', 'mime_type': '...'} | ||
| url = image_item.get("url") | ||
| if not url: | ||
| continue # Skip if there's no URL | ||
|
|
||
| mime_type = image_item.get("mime_type") | ||
| if not mime_type: | ||
| # Fallback to guessing the mime type if not provided | ||
| mime_type, _ = mimetypes.guess_type(url) | ||
| if not mime_type: | ||
| logger.warning(f"Could not determine MIME type for {url}. Defaulting to 'image/png'.") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. logger.debug There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think it's a good idea to only log debug-level messages when the mime_type is missing, as this makes it hard for people to notice the problem—even though other API implementations might not log anything at all. I believe that at the very least a warning should be issued when mime_type is missing, or even an exception should be thrown. |
||
| mime_type = "image/png" | ||
|
|
||
| parts.append({"fileData": {"fileUri": url, "mimeType": mime_type}}) | ||
|
|
||
| # Fallback for simple text-only messages | ||
| elif isinstance(content, str): | ||
| parts.append({"text": content}) | ||
|
|
||
| # Fallback for any other unexpected type | ||
| else: | ||
| logger.warning(f"Unsupported message content type: {type(content)}. Converting to string.") | ||
| parts.append({"text": str(content)}) | ||
|
|
||
| # For multi-modal requests, Gemini expects a single 'contents' object | ||
| # with a 'parts' list. The role is typically 'user'. | ||
| request_object = {"contents": [{"role": "user", "parts": parts}]} | ||
|
|
||
| if generic_request.response_format: | ||
| request_object.update( | ||
| # Ensure generationConfig exists before trying to update it | ||
| if "generationConfig" not in request_object: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we are creating |
||
| request_object["generationConfig"] = {} | ||
| request_object["generationConfig"].update( | ||
| { | ||
| "generationConfig": { | ||
| "responseMimeType": "application/json", | ||
| "responseSchema": _response_format_to_json(self.prompt_formatter.response_format), | ||
| } | ||
| "responseMimeType": "application/json", | ||
| "responseSchema": _response_format_to_json(self.prompt_formatter.response_format), | ||
| } | ||
| ) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do you need the "texts" or "images" check?