NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch. Added lines and hunk headers are
restored exactly (counts verified); blank context lines, context indentation
and docstring wrap points are inferred from the hunk counts and must be checked
against the base files. The post-image blob hashes on the `index` lines predate
the review edits made to added lines below.

diff --git a/redisvl/utils/vectorize/voyageai.py b/redisvl/utils/vectorize/voyageai.py
index e50fab7c..668d8253 100644
--- a/redisvl/utils/vectorize/voyageai.py
+++ b/redisvl/utils/vectorize/voyageai.py
@@ -1,4 +1,5 @@
 import os
+import warnings
 from typing import TYPE_CHECKING, Any
 
 from pydantic import ConfigDict
@@ -13,6 +14,29 @@
 # ignore that voyageai isn't imported
 # mypy: disable-error-code="name-defined"
 
+# Sentinel used to detect when model is not explicitly passed to __init__
+_MODEL_NOT_SET = object()
+
+# Token limits for VoyageAI models (used for token-aware batching)
+VOYAGE_TOTAL_TOKEN_LIMITS = {
+    "voyage-context-3": 32_000,
+    "voyage-3.5-lite": 1_000_000,
+    "voyage-3.5": 320_000,
+    "voyage-2": 320_000,
+    "voyage-3-large": 120_000,
+    "voyage-code-3": 120_000,
+    "voyage-large-2-instruct": 120_000,
+    "voyage-finance-2": 120_000,
+    "voyage-multilingual-2": 120_000,
+    "voyage-law-2": 120_000,
+    "voyage-large-2": 120_000,
+    "voyage-3": 120_000,
+    "voyage-3-lite": 120_000,
+    "voyage-code-2": 120_000,
+    "voyage-multimodal-3": 32_000,
+    "voyage-multimodal-3.5": 32_000,
+}
+
 
 class VoyageAIVectorizer(BaseVectorizer):
     """The VoyageAIVectorizer class utilizes VoyageAI's API to generate
@@ -87,13 +111,28 @@ class VoyageAIVectorizer(BaseVectorizer):
             input_type="query"
         )
 
+        # Using contextualized embeddings (voyage-context-3)
+        context_vectorizer = VoyageAIVectorizer(
+            model="voyage-context-3",
+            api_config={"api_key": "your-voyageai-api-key"}
+        )
+        # Context models automatically use contextualized_embed API
+        context_embeddings = context_vectorizer.embed_many(
+            contents=["chunk 1", "chunk 2", "chunk 3"],
+            input_type="document"
+        )
+
+        # Token counting for API usage management
+        token_counts = vectorizer.count_tokens(["text one", "text two"])
+        print(f"Token counts: {token_counts}")
+
     """
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     def __init__(
         self,
-        model: str = "voyage-3-large",
+        model: str = _MODEL_NOT_SET,  # type: ignore[assignment]
         api_config: dict[str, Any] | None = None,
         dtype: str = "float32",
         cache: "EmbeddingsCache | None" = None,
@@ -105,6 +144,8 @@ def __init__(
 
         Args:
             model (str): Model to use for embedding. Defaults to "voyage-3-large".
+                The default will be removed in the next major version; please specify
+                the model explicitly.
             api_config (Optional[Dict], optional): Dictionary containing the API key.
                 Defaults to None.
             dtype (str): the default datatype to use when embedding content as byte arrays.
@@ -122,6 +163,16 @@ def __init__(
               ffmpeg installed on the system. Image embeddings require pillow to be installed.
 
         """
+        if model is _MODEL_NOT_SET:
+            warnings.warn(
+                "Instantiating VoyageAIVectorizer without an explicit 'model' "
+                "parameter is deprecated. The default ('voyage-3-large') will be "
+                "removed in the next major version. Please pass model='voyage-3-large' "
+                "(or your preferred model) explicitly.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            model = "voyage-3-large"
         super().__init__(model=model, dtype=dtype, cache=cache)
         # Initialize client and set up the model
         self._setup(api_config, **kwargs)
@@ -318,7 +369,9 @@ def _embed_many(
         Args:
             contents: List of items to embed - each item must be one of str, PIL.Image.Image, or
                 voyageai.video_utils.Video. Images and video require a multimodal model to be configured.
-            batch_size: Number of items to process in each API call
+            batch_size: Deprecated. Number of items to process in each API call.
+                Batch size is now determined automatically based on the model.
+                This parameter will be removed in the next major version.
             **kwargs: Additional parameters to pass to the VoyageAI API
 
         Returns:
@@ -336,8 +389,17 @@ def _embed_many(
         # Validate inputs
         self._validate_input(contents, input_type, truncation)
 
-        # Determine batch size if not provided
-        if batch_size is None:
+        # Determine batch size - auto-determined based on model; explicit
+        # batch_size is deprecated.
+        if batch_size is not None:
+            warnings.warn(
+                "The 'batch_size' parameter is deprecated for VoyageAIVectorizer. "
+                "Batch size is now automatically determined based on the model's "
+                "token limits. This parameter will be removed in the next major version.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+        else:
             batch_size = self._get_batch_size()
 
         try:
@@ -393,7 +455,9 @@ async def _aembed_many(
         Args:
             contents: List of items to embed - each item must be one of str, PIL.Image.Image, or
                 voyageai.video_utils.Video. Images and video require a multimodal model to be configured.
-            batch_size: Number of texts to process in each API call
+            batch_size: Deprecated. Number of texts to process in each API call.
+                Batch size is now determined automatically based on the model.
+                This parameter will be removed in the next major version.
             **kwargs: Additional parameters to pass to the VoyageAI API
 
         Returns:
@@ -411,8 +475,17 @@ async def _aembed_many(
         # Validate inputs
         self._validate_input(contents, input_type, truncation)
 
-        # Determine batch size if not provided
-        if batch_size is None:
+        # Determine batch size - auto-determined based on model; explicit
+        # batch_size is deprecated.
+        if batch_size is not None:
+            warnings.warn(
+                "The 'batch_size' parameter is deprecated for VoyageAIVectorizer. "
+                "Batch size is now automatically determined based on the model's "
+                "token limits. This parameter will be removed in the next major version.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+        else:
             batch_size = self._get_batch_size()
 
         try:
@@ -448,6 +521,86 @@ def _serialize_for_cache(self, content: Any) -> bytes | str:
             return content.to_bytes()
         return super()._serialize_for_cache(content)
 
+    def _is_context_model(self) -> bool:
+        """
+        Check if the current model is a contextualized embedding model.
+
+        Contextualized models (like voyage-context-3) use a different API
+        endpoint and expect inputs formatted differently.
+
+        Returns:
+            bool: True if the model is a context model, False otherwise.
+        """
+        return "context" in self.model
+
+    def count_tokens(self, texts: list[str]) -> list[int]:
+        """
+        Count tokens for the given texts using VoyageAI's local tokenizer.
+
+        This method runs entirely on the CPU using the HuggingFace ``tokenizers``
+        library — it does NOT make any network/API calls. It is safe to call
+        frequently (e.g., for token-aware batching) without incurring API costs
+        or latency.
+
+        Args:
+            texts: List of texts to count tokens for.
+
+        Returns:
+            list[int]: List of token counts for each text.
+
+        Raises:
+            ValueError: If tokenization fails.
+
+        Example:
+            >>> vectorizer = VoyageAIVectorizer(model="voyage-3.5")
+            >>> token_counts = vectorizer.count_tokens(["Hello world", "Another text"])
+            >>> print(token_counts)  # [2, 2]
+        """
+        if not texts:
+            return []
+
+        try:
+            # tokenize() is a local CPU operation using HuggingFace tokenizers,
+            # not a remote API call.
+            token_lists = self._client.tokenize(texts, model=self.model)
+            return [len(token_list) for token_list in token_lists]
+        except Exception as e:
+            raise ValueError(f"Token counting failed: {e}") from e
+
+    async def acount_tokens(self, texts: list[str]) -> list[int]:
+        """
+        Asynchronously count tokens for the given texts using VoyageAI's local tokenizer.
+
+        This method runs entirely on the CPU using the HuggingFace ``tokenizers``
+        library — it does NOT make any network/API calls. The underlying
+        tokenize operation is synchronous (CPU-bound), so this async wrapper
+        provides interface compatibility but does not yield to the event loop.
+
+        Args:
+            texts: List of texts to count tokens for.
+
+        Returns:
+            list[int]: List of token counts for each text.
+
+        Raises:
+            ValueError: If tokenization fails.
+
+        Example:
+            >>> vectorizer = VoyageAIVectorizer(model="voyage-3.5")
+            >>> token_counts = await vectorizer.acount_tokens(["Hello world", "Another text"])
+            >>> print(token_counts)  # [2, 2]
+        """
+        if not texts:
+            return []
+
+        try:
+            # tokenize() is a local CPU operation (HuggingFace tokenizers),
+            # not a remote API call. Synchronous even on AsyncClient.
+            token_lists = self._aclient.tokenize(texts, model=self.model)
+            return [len(token_list) for token_list in token_lists]
+        except Exception as e:
+            raise ValueError(f"Token counting failed: {e}") from e
+
     @property
     def type(self) -> str:
         return "voyageai"
diff --git a/tests/integration/test_vectorizers.py b/tests/integration/test_vectorizers.py
index 7fcd993e..aa1db34a 100644
--- a/tests/integration/test_vectorizers.py
+++ b/tests/integration/test_vectorizers.py
@@ -646,3 +646,183 @@ def test_deprecated_text_parameter_warning():
         embeddings = vectorizer.embed_many(texts=TEST_TEXTS)
     assert isinstance(embeddings, list)
     assert len(embeddings) == len(TEST_TEXTS)
+
+
+# VoyageAI-specific tests for token counting and context model detection
+@pytest.mark.requires_api_keys
+def test_voyageai_count_tokens():
+    """Test VoyageAI token counting functionality."""
+    vectorizer = VoyageAIVectorizer(model="voyage-3.5")
+    texts = ["Hello world", "This is a longer test sentence."]
+
+    token_counts = vectorizer.count_tokens(texts)
+    assert isinstance(token_counts, list)
+    assert len(token_counts) == len(texts)
+    assert all(isinstance(count, int) and count > 0 for count in token_counts)
+
+    # Empty list should return empty list
+    assert vectorizer.count_tokens([]) == []
+
+
+@pytest.mark.requires_api_keys
+@pytest.mark.asyncio
+async def test_voyageai_acount_tokens():
+    """Test VoyageAI async token counting functionality."""
+    vectorizer = VoyageAIVectorizer(model="voyage-3.5")
+    texts = ["Hello world", "This is a longer test sentence."]
+
+    token_counts = await vectorizer.acount_tokens(texts)
+    assert isinstance(token_counts, list)
+    assert len(token_counts) == len(texts)
+    assert all(isinstance(count, int) and count > 0 for count in token_counts)
+
+    # Empty list should return empty list
+    assert await vectorizer.acount_tokens([]) == []
+
+
+def test_voyageai_token_limits():
+    """Test VoyageAI token limit constants."""
+    from redisvl.utils.vectorize.voyageai import VOYAGE_TOTAL_TOKEN_LIMITS
+
+    # Verify token limits are defined correctly
+    assert VOYAGE_TOTAL_TOKEN_LIMITS.get("voyage-context-3") == 32_000
+    assert VOYAGE_TOTAL_TOKEN_LIMITS.get("voyage-3.5-lite") == 1_000_000
+    assert VOYAGE_TOTAL_TOKEN_LIMITS.get("voyage-3.5") == 320_000
+    assert VOYAGE_TOTAL_TOKEN_LIMITS.get("voyage-multimodal-3") == 32_000
+    assert VOYAGE_TOTAL_TOKEN_LIMITS.get("voyage-multimodal-3.5") == 32_000
+
+    # Default for unknown models
+    assert VOYAGE_TOTAL_TOKEN_LIMITS.get("unknown-model", 120_000) == 120_000
+
+
+def test_voyageai_context_model_detection():
+    """Test detection of contextualized embedding models."""
+    from types import SimpleNamespace
+
+    test_cases = [
+        ("voyage-3.5", False),
+        ("voyage-context-3", True),
+        ("voyage-multimodal-3.5", False),
+        ("voyage-3-large", False),
+    ]
+    for model_name, expected in test_cases:
+        # Call the real method on a lightweight stand-in so no API key or
+        # client setup is needed; the method only reads ``self.model``.
+        stand_in = SimpleNamespace(model=model_name)
+        detected = VoyageAIVectorizer._is_context_model(stand_in)
+        assert detected == expected, f"Failed for {model_name}"
+
+
+@pytest.mark.requires_api_keys
+def test_voyageai_multimodal_text_only():
+    """Test VoyageAI multimodal vectorizer with text-only input."""
+    vectorizer = VoyageAIVectorizer(model="voyage-multimodal-3")
+
+    # Test single text embedding via embed()
+    embedding = vectorizer.embed("A red apple on a wooden table")
+    assert isinstance(embedding, list)
+    assert len(embedding) > 0
+    assert all(isinstance(x, float) for x in embedding)
+
+    # Test another text embedding to verify consistency
+    embedding2 = vectorizer.embed("A cat sleeping on a couch")
+    assert isinstance(embedding2, list)
+    assert len(embedding2) == len(embedding)
+
+
+@pytest.mark.requires_api_keys
+def test_voyageai_multimodal_image():
+    """Test VoyageAI multimodal vectorizer with image input."""
+    import os
+    import tempfile
+
+    from PIL import Image
+
+    vectorizer = VoyageAIVectorizer(model="voyage-multimodal-3")
+
+    # Create a simple test image
+    img = Image.new("RGB", (100, 100), color="red")
+    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
+        img.save(f, format="PNG")
+        temp_path = f.name
+
+    try:
+        # Test embed_image
+        embedding = vectorizer.embed_image(temp_path)
+        assert isinstance(embedding, list)
+        assert len(embedding) > 0
+        assert all(isinstance(x, float) for x in embedding)
+    finally:
+        os.unlink(temp_path)
+
+
+@pytest.mark.requires_api_keys
+def test_voyageai_multimodal_video():
+    """Test VoyageAI multimodal vectorizer with video input."""
+    import os
+    import subprocess
+    import tempfile
+
+    from PIL import Image
+
+    vectorizer = VoyageAIVectorizer(model="voyage-multimodal-3.5")
+
+    # Create a minimal test video using ffmpeg
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # Create 3 frames
+        for i in range(3):
+            img = Image.new("RGB", (64, 64), color=(i * 80, 100, 150))
+            img.save(os.path.join(tmpdir, f"frame_{i:03d}.png"))
+
+        video_path = os.path.join(tmpdir, "test_video.mp4")
+
+        # Create video from frames
+        try:
+            result = subprocess.run(
+                [
+                    "ffmpeg",
+                    "-y",
+                    "-framerate",
+                    "1",
+                    "-i",
+                    os.path.join(tmpdir, "frame_%03d.png"),
+                    "-c:v",
+                    "libx264",
+                    "-pix_fmt",
+                    "yuv420p",
+                    "-t",
+                    "3",
+                    video_path,
+                ],
+                capture_output=True,
+            )
+        except OSError:
+            # subprocess.run raises (rather than returning a non-zero code)
+            # when the ffmpeg binary is missing or not executable.
+            pytest.skip("ffmpeg not available or failed to create test video")
+
+        if result.returncode != 0:
+            pytest.skip("ffmpeg not available or failed to create test video")
+
+        # Test embed_video
+        embedding = vectorizer.embed_video(video_path)
+        assert isinstance(embedding, list)
+        assert len(embedding) > 0
+        assert all(isinstance(x, float) for x in embedding)
+
+
+@pytest.mark.requires_api_keys
+@pytest.mark.asyncio
+async def test_voyageai_multimodal_async():
+    """Test VoyageAI multimodal vectorizer async methods."""
+    vectorizer = VoyageAIVectorizer(model="voyage-multimodal-3")
+
+    # Test async text embedding
+    embedding = await vectorizer.aembed("A beautiful sunset over mountains")
+    assert isinstance(embedding, list)
+    assert len(embedding) > 0
+
+    # Test async batch
+    texts = ["Ocean waves", "Forest trees"]
+    embeddings = await vectorizer.aembed_many(texts)
+    assert len(embeddings) == 2