167 changes: 160 additions & 7 deletions redisvl/utils/vectorize/voyageai.py
@@ -1,4 +1,5 @@
import os
import warnings
from typing import TYPE_CHECKING, Any

from pydantic import ConfigDict
@@ -13,6 +14,29 @@
# ignore that voyageai isn't imported
# mypy: disable-error-code="name-defined"

# Sentinel used to detect when model is not explicitly passed to __init__
_MODEL_NOT_SET = object()

# Token limits for VoyageAI models (used for token-aware batching)
VOYAGE_TOTAL_TOKEN_LIMITS = {
"voyage-context-3": 32_000,
"voyage-3.5-lite": 1_000_000,
"voyage-3.5": 320_000,
"voyage-2": 320_000,
"voyage-3-large": 120_000,
"voyage-code-3": 120_000,
"voyage-large-2-instruct": 120_000,
"voyage-finance-2": 120_000,
"voyage-multilingual-2": 120_000,
"voyage-law-2": 120_000,
"voyage-large-2": 120_000,
"voyage-3": 120_000,
"voyage-3-lite": 120_000,
"voyage-code-2": 120_000,
"voyage-multimodal-3": 32_000,
"voyage-multimodal-3.5": 32_000,
}
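# Illustrative sketch only (not part of this change): one way these limits
# could drive token-aware batching. The 512-token-per-item estimate and the
# 1,000-item cap below are assumed values, not documented VoyageAI limits.
def _suggested_batch_size(model: str, avg_tokens_per_item: int = 512) -> int:
    total_limit = VOYAGE_TOTAL_TOKEN_LIMITS.get(model, 120_000)
    # Keep each request's combined token count under the model's budget.
    return max(1, min(1_000, total_limit // avg_tokens_per_item))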


class VoyageAIVectorizer(BaseVectorizer):
"""The VoyageAIVectorizer class utilizes VoyageAI's API to generate
@@ -87,13 +111,28 @@ class VoyageAIVectorizer(BaseVectorizer):
input_type="query"
)

# Using contextualized embeddings (voyage-context-3)
context_vectorizer = VoyageAIVectorizer(
model="voyage-context-3",
api_config={"api_key": "your-voyageai-api-key"}
)
# Context models automatically use contextualized_embed API
context_embeddings = context_vectorizer.embed_many(
contents=["chunk 1", "chunk 2", "chunk 3"],
input_type="document"
)

# Token counting for API usage management
token_counts = vectorizer.count_tokens(["text one", "text two"])
print(f"Token counts: {token_counts}")

"""

model_config = ConfigDict(arbitrary_types_allowed=True)

def __init__(
self,
model: str = "voyage-3-large",
model: str = _MODEL_NOT_SET, # type: ignore[assignment]
api_config: dict[str, Any] | None = None,
dtype: str = "float32",
cache: "EmbeddingsCache | None" = None,
@@ -105,6 +144,8 @@ def __init__(

Args:
model (str): Model to use for embedding. Defaults to "voyage-3-large".
The default will be removed in the next major version; please specify
the model explicitly.
api_config (Optional[Dict], optional): Dictionary containing the API key.
Defaults to None.
dtype (str): the default datatype to use when embedding content as byte arrays.
@@ -122,6 +163,16 @@
ffmpeg installed on the system. Image embeddings require pillow to be installed.

"""
if model is _MODEL_NOT_SET:
warnings.warn(
"Instantiating VoyageAIVectorizer without an explicit 'model' "
"parameter is deprecated. The default ('voyage-3-large') will be "
"removed in the next major version. Please pass model='voyage-3-large' "
"(or your preferred model) explicitly.",
DeprecationWarning,
stacklevel=2,
)
model = "voyage-3-large"
super().__init__(model=model, dtype=dtype, cache=cache)
# Initialize client and set up the model
self._setup(api_config, **kwargs)
@@ -318,7 +369,9 @@ def _embed_many(
Args:
contents: List of items to embed - each item must be one of str, PIL.Image.Image, or
voyageai.video_utils.Video. Images and video require a multimodal model to be configured.
batch_size: Number of items to process in each API call
batch_size: Deprecated. Number of items to process in each API call.
Batch size is now determined automatically based on the model.
This parameter will be removed in the next major version.
**kwargs: Additional parameters to pass to the VoyageAI API

Returns:
@@ -336,8 +389,17 @@
# Validate inputs
self._validate_input(contents, input_type, truncation)

# Determine batch size if not provided
if batch_size is None:
# Determine batch size - auto-determined based on model; explicit
# batch_size is deprecated.
if batch_size is not None:
warnings.warn(
"The 'batch_size' parameter is deprecated for VoyageAIVectorizer. "
"Batch size is now automatically determined based on the model's "
"token limits. This parameter will be removed in the next major version.",
DeprecationWarning,
stacklevel=2,
)
else:

Auto-batching never activates due to base class default

High Severity

The batch_size parameter in _embed_many and _aembed_many will never be None when called through the public embed_many/aembed_many API. The base class BaseVectorizer.embed_many declares batch_size: int = 10 and always forwards it, so the if batch_size is not None check always evaluates to True. This means the deprecation warning fires on every call (even when the user didn't pass batch_size), and the model-specific auto-determined batch size from _get_batch_size() is never used.

Reviewed by Cursor Bugbot for commit 1e971a6.
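A minimal sketch of one possible fix, reusing the sentinel pattern this PR already applies to the model argument. The embed_many signature shown here (contents, batch_size, **kwargs), the _BATCH_SIZE_NOT_SET name, and the base-class default of 10 are taken from the comment above or assumed; the real BaseVectorizer.embed_many may accept additional parameters.

_BATCH_SIZE_NOT_SET = object()

class VoyageAIVectorizer(BaseVectorizer):
    def embed_many(self, contents, batch_size=_BATCH_SIZE_NOT_SET, **kwargs):
        # Override the public entry point so an explicitly passed batch_size
        # can be told apart from the base-class default.
        if batch_size is _BATCH_SIZE_NOT_SET:
            # Caller omitted batch_size: use model-aware auto-batching and
            # skip the deprecation warning.
            batch_size = self._get_batch_size()
        else:
            warnings.warn(
                "The 'batch_size' parameter is deprecated for "
                "VoyageAIVectorizer; it is now determined automatically "
                "from the model's token limits.",
                DeprecationWarning,
                stacklevel=2,
            )
        return super().embed_many(contents, batch_size=batch_size, **kwargs)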

batch_size = self._get_batch_size()

try:
@@ -393,7 +455,9 @@ async def _aembed_many(
Args:
contents: List of items to embed - each item must be one of str, PIL.Image.Image, or
voyageai.video_utils.Video. Images and video require a multimodal model to be configured.
batch_size: Number of texts to process in each API call
batch_size: Deprecated. Number of texts to process in each API call.
Batch size is now determined automatically based on the model.
This parameter will be removed in the next major version.
**kwargs: Additional parameters to pass to the VoyageAI API

Returns:
@@ -411,8 +475,17 @@
# Validate inputs
self._validate_input(contents, input_type, truncation)

# Determine batch size if not provided
if batch_size is None:
# Determine batch size - auto-determined based on model; explicit
# batch_size is deprecated.
if batch_size is not None:
warnings.warn(
"The 'batch_size' parameter is deprecated for VoyageAIVectorizer. "
"Batch size is now automatically determined based on the model's "
"token limits. This parameter will be removed in the next major version.",
DeprecationWarning,
stacklevel=2,
)
else:
batch_size = self._get_batch_size()

try:
@@ -448,6 +521,86 @@ def _serialize_for_cache(self, content: Any) -> bytes | str:
return content.to_bytes()
return super()._serialize_for_cache(content)

def _is_context_model(self) -> bool:
"""
Check if the current model is a contextualized embedding model.

Contextualized models (like voyage-context-3) use a different API
endpoint and expect inputs formatted differently.

Returns:
bool: True if the model is a context model, False otherwise.
"""
return "context" in self.model

Unused method _is_context_model and constant VOYAGE_TOTAL_TOKEN_LIMITS

Medium Severity

_is_context_model() is defined but never called anywhere in production code — the _setup method doesn't route context models to contextualized_embed. Similarly, VOYAGE_TOTAL_TOKEN_LIMITS is defined but never referenced in production code (only in tests). The docstring advertises that "Context models automatically use contextualized_embed API," but this behavior is not implemented, making the documentation misleading.

Reviewed by Cursor Bugbot for commit 1e971a6.
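A minimal sketch of how the batching path in _embed_many could route contextualized models through VoyageAI's contextualized embeddings endpoint, which would make _is_context_model() load-bearing. The helper name _embed_batch is hypothetical, and the contextualized_embed call shape follows VoyageAI's published examples, so it should be verified against the installed client version.

def _embed_batch(self, batch, input_type=None, **kwargs):
    # Hypothetical helper: embed one batch, choosing the endpoint by model.
    if self._is_context_model():
        # Contextualized models (e.g. voyage-context-3) embed all chunks of a
        # document together so each chunk's embedding reflects its neighbors.
        response = self._client.contextualized_embed(
            inputs=[batch],
            model=self.model,
            input_type=input_type,
            **kwargs,
        )
        return response.results[0].embeddings
    response = self._client.embed(
        texts=batch,
        model=self.model,
        input_type=input_type,
        **kwargs,
    )
    return response.embeddings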


def count_tokens(self, texts: list[str]) -> list[int]:
"""
Count tokens for the given texts using VoyageAI's local tokenizer.

This method runs entirely on the CPU using the HuggingFace ``tokenizers``
library — it does NOT make any network/API calls. It is safe to call
frequently (e.g., for token-aware batching) without incurring API costs
or latency.

Args:
texts: List of texts to count tokens for.

Returns:
list[int]: List of token counts for each text.

Raises:
ValueError: If tokenization fails.

Example:
>>> vectorizer = VoyageAIVectorizer(model="voyage-3.5")
>>> token_counts = vectorizer.count_tokens(["Hello world", "Another text"])
>>> print(token_counts) # [2, 2]
"""
if not texts:
return []

try:
# tokenize() is a local CPU operation using HuggingFace tokenizers,
# not a remote API call.
token_lists = self._client.tokenize(texts, model=self.model)
return [len(token_list) for token_list in token_lists]
except Exception as e:
raise ValueError(f"Token counting failed: {e}")

async def acount_tokens(self, texts: list[str]) -> list[int]:
"""
Asynchronously count tokens for the given texts using VoyageAI's local tokenizer.

This method runs entirely on the CPU using the HuggingFace ``tokenizers``
library — it does NOT make any network/API calls. The underlying
tokenize operation is synchronous (CPU-bound), so this async wrapper
provides interface compatibility but does not yield to the event loop.

Args:
texts: List of texts to count tokens for.

Returns:
list[int]: List of token counts for each text.

Raises:
ValueError: If tokenization fails.

Example:
>>> vectorizer = VoyageAIVectorizer(model="voyage-3.5")
>>> token_counts = await vectorizer.acount_tokens(["Hello world", "Another text"])
>>> print(token_counts) # [2, 2]
"""
if not texts:
return []

try:
# tokenize() is a local CPU operation (HuggingFace tokenizers),
# not a remote API call. Synchronous even on AsyncClient.
token_lists = self._aclient.tokenize(texts, model=self.model)
return [len(token_list) for token_list in token_lists]
except Exception as e:
raise ValueError(f"Token counting failed: {e}")

@property
def type(self) -> str:
return "voyageai"