
Commit 576f33b

Add cache to async LM call (#8135)
* add cache for async lm calls
* Cache async
* fix tests
1 parent d41d8d1 commit 576f33b
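
In effect, asynchronous LM calls now go through the same DSPy request cache (in-memory plus disk) that synchronous calls use. A minimal sketch of the resulting behavior, not part of the commit, assuming an OpenAI API key is configured in the environment and the default cache settings are in place:

import asyncio

import dspy


async def main():
    lm = dspy.LM(model="openai/gpt-4o-mini", cache=True)

    # First call goes out to the provider and populates the cache.
    first = await lm.acall("What is 2 + 2?")
    # An identical request should now be answered from the cache.
    second = await lm.acall("What is 2 + 2?")
    print(first, second)


asyncio.run(main())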

File tree: 4 files changed (+159, −60 lines)

dspy/clients/cache.py

Lines changed: 46 additions & 8 deletions
@@ -1,4 +1,5 @@
 import copy
+import inspect
 import logging
 import threading
 from functools import wraps
@@ -119,14 +120,20 @@ def get(self, request: Dict[str, Any], ignored_args_for_cache_key: Optional[list
             response.usage = {}
         return response
 
-    def put(self, request: Dict[str, Any], value: Any, ignored_args_for_cache_key: Optional[list[str]] = None) -> None:
+    def put(
+        self,
+        request: Dict[str, Any],
+        value: Any,
+        ignored_args_for_cache_key: Optional[list[str]] = None,
+        enable_memory_cache: bool = True,
+    ) -> None:
         try:
             key = self.cache_key(request, ignored_args_for_cache_key)
         except Exception:
             logger.debug(f"Failed to generate cache key for request: {request}")
             return
 
-        if self.enable_memory_cache:
+        if self.enable_memory_cache and enable_memory_cache:
             with self._lock:
                 self.memory_cache[key] = value
 
@@ -164,6 +171,7 @@ def load_memory_cache(self, filepath: str) -> None:
 def request_cache(
     cache_arg_name: Optional[str] = None,
     ignored_args_for_cache_key: Optional[list[str]] = ["api_key", "api_base", "base_url"],
+    enable_memory_cache: bool = True,
     *,  # everything after this is keyword-only
     maxsize: Optional[int] = None,  # legacy / no-op
 ):
@@ -174,6 +182,8 @@ def request_cache(
         cache_arg_name: The name of the argument that contains the request. If not provided, the entire kwargs is used
             as the request.
         ignored_args_for_cache_key: A list of arguments to ignore when computing the cache key from the request.
+        enable_memory_cache: Whether to enable in-memory cache at call time. If False, the memory cache will not be
+            written to on new data.
     """
 
     # Deprecation notice
@@ -186,10 +196,7 @@ def request_cache(
 
     def decorator(fn):
         @wraps(fn)
-        def wrapper(*args, **kwargs):
-            import dspy
-
-            cache = dspy.cache
+        def process_request(args, kwargs):
             # Use fully qualified function name for uniqueness
             fn_identifier = f"{fn.__module__}.{fn.__qualname__}"
 
@@ -206,6 +213,15 @@ def wrapper(*args, **kwargs):
                 modified_request[f"positional_arg_{i}"] = arg
             modified_request["_fn_identifier"] = fn_identifier
 
+            return modified_request
+
+        @wraps(fn)
+        def sync_wrapper(*args, **kwargs):
+            import dspy
+
+            cache = dspy.cache
+            modified_request = process_request(args, kwargs)
+
             # Retrieve from cache if available
             cached_result = cache.get(modified_request, ignored_args_for_cache_key)
 
@@ -214,10 +230,32 @@ def wrapper(*args, **kwargs):
 
             # Otherwise, compute and store the result
             result = fn(*args, **kwargs)
-            cache.put(modified_request, result, ignored_args_for_cache_key)
+            # `enable_memory_cache` can be provided at call time to avoid indefinite growth.
+            cache.put(modified_request, result, ignored_args_for_cache_key, enable_memory_cache)
+
+            return result
+
+        @wraps(fn)
+        async def async_wrapper(*args, **kwargs):
+            import dspy
+
+            cache = dspy.cache
+            modified_request = process_request(args, kwargs)
+
+            # Retrieve from cache if available
+            cached_result = cache.get(modified_request, ignored_args_for_cache_key)
+            if cached_result is not None:
+                return cached_result
+
+            # Otherwise, compute and store the result
+            result = await fn(*args, **kwargs)
+            cache.put(modified_request, result, ignored_args_for_cache_key, enable_memory_cache)
 
             return result
 
-        return wrapper
+        if inspect.iscoroutinefunction(fn):
+            return async_wrapper
+        else:
+            return sync_wrapper
 
     return decorator
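
In short, `request_cache` now checks `inspect.iscoroutinefunction(fn)` and returns an async-aware wrapper when it decorates a coroutine function, and the new `enable_memory_cache` flag suppresses writes to the in-process cache while leaving the disk cache, when enabled, untouched. A small illustrative sketch, not part of the commit; the decorated functions and their arguments are made up:

import asyncio

from dspy.clients.cache import request_cache


# Decorating a coroutine function is now supported directly; the decorator
# returns async_wrapper, so awaited results are cached like sync results.
@request_cache()
async def fetch_answer(prompt: str, model: str) -> str:
    return f"Response for {prompt} with {model}"


# Skip the in-memory layer to keep long-running processes from growing
# dspy.cache.memory_cache without bound; disk caching is unaffected.
@request_cache(enable_memory_cache=False)
async def fetch_answer_disk_only(prompt: str, model: str) -> str:
    return f"Response for {prompt} with {model}"


async def main():
    print(await fetch_answer(prompt="Hello", model="openai/gpt-4o-mini"))
    # Same arguments, so this call should be served from dspy.cache.
    print(await fetch_answer(prompt="Hello", model="openai/gpt-4o-mini"))


asyncio.run(main())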

dspy/clients/lm.py

Lines changed: 45 additions & 52 deletions
@@ -54,7 +54,7 @@ def __init__(
             max_tokens: The maximum number of tokens to generate per response.
             cache: Whether to cache the model responses for reuse to improve performance
                 and reduce costs.
-            cache_in_memory: To enable additional caching with LRU in memory.
+            cache_in_memory (deprecated): To enable additional caching with LRU in memory.
             callbacks: A list of callback functions to run before and after each request.
             num_retries: The number of times to retry a request if it fails transiently due to
                 network error, rate limiting, etc. Requests are retried with exponential
@@ -92,44 +92,69 @@ def __init__(
         else:
             self.kwargs = dict(temperature=temperature, max_tokens=max_tokens, **kwargs)
 
+    def _get_cached_completion_fn(self, completion_fn, cache, enable_memory_cache):
+        ignored_args_for_cache_key = ["api_key", "api_base", "base_url"]
+        if cache and enable_memory_cache:
+            completion_fn = request_cache(
+                cache_arg_name="request",
+                ignored_args_for_cache_key=ignored_args_for_cache_key,
+            )(completion_fn)
+        elif cache:
+            completion_fn = request_cache(
+                cache_arg_name="request",
+                ignored_args_for_cache_key=ignored_args_for_cache_key,
+                enable_memory_cache=False,
+            )(completion_fn)
+        else:
+            completion_fn = completion_fn
+
+        if not cache or litellm.cache is None:
+            litellm_cache_args = {"no-cache": True, "no-store": True}
+        else:
+            litellm_cache_args = {"no-cache": False, "no-store": False}
+
+        return completion_fn, litellm_cache_args
+
     def forward(self, prompt=None, messages=None, **kwargs):
         # Build the request.
         cache = kwargs.pop("cache", self.cache)
-        # disable cache will also disable in memory cache
-        cache_in_memory = cache and kwargs.pop("cache_in_memory", self.cache_in_memory)
+        enable_memory_cache = kwargs.pop("cache_in_memory", self.cache_in_memory)
+
         messages = messages or [{"role": "user", "content": prompt}]
         kwargs = {**self.kwargs, **kwargs}
 
-        # Make the request and handle LRU & disk caching.
-        if cache_in_memory:
-            completion = cached_litellm_completion if self.model_type == "chat" else cached_litellm_text_completion
-
-            results = completion(
-                request=dict(model=self.model, messages=messages, **kwargs),
-                num_retries=self.num_retries,
-            )
-        else:
-            completion = litellm_completion if self.model_type == "chat" else litellm_text_completion
+        completion = litellm_completion if self.model_type == "chat" else litellm_text_completion
+        completion, litellm_cache_args = self._get_cached_completion_fn(completion, cache, enable_memory_cache)
 
-            results = completion(
-                request=dict(model=self.model, messages=messages, **kwargs),
-                num_retries=self.num_retries,
-                # only leverage LiteLLM cache in this case
-                cache={"no-cache": not cache, "no-store": not cache},
-            )
+        results = completion(
+            request=dict(model=self.model, messages=messages, **kwargs),
+            num_retries=self.num_retries,
+            cache=litellm_cache_args,
+        )
 
         if not getattr(results, "cache_hit", False) and dspy.settings.usage_tracker and hasattr(results, "usage"):
             settings.usage_tracker.add_usage(self.model, dict(results.usage))
         return results
 
     async def aforward(self, prompt=None, messages=None, **kwargs):
-        completion = alitellm_completion if self.model_type == "chat" else alitellm_text_completion
+        # Build the request.
+        cache = kwargs.pop("cache", self.cache)
+        enable_memory_cache = kwargs.pop("cache_in_memory", self.cache_in_memory)
 
         messages = messages or [{"role": "user", "content": prompt}]
+        kwargs = {**self.kwargs, **kwargs}
+
+        completion = alitellm_completion if self.model_type == "chat" else alitellm_text_completion
+        completion, litellm_cache_args = self._get_cached_completion_fn(completion, cache, enable_memory_cache)
+
         results = await completion(
             request=dict(model=self.model, messages=messages, **kwargs),
             num_retries=self.num_retries,
+            cache=litellm_cache_args,
        )
+
+        if not getattr(results, "cache_hit", False) and dspy.settings.usage_tracker and hasattr(results, "usage"):
+            settings.usage_tracker.add_usage(self.model, dict(results.usage))
         return results
 
     def launch(self, launch_kwargs: Optional[Dict[str, Any]] = None):
@@ -206,22 +231,6 @@ def dump_state(self):
         return {key: getattr(self, key) for key in state_keys} | self.kwargs
 
 
-@request_cache(cache_arg_name="request", ignored_args_for_cache_key=["api_key", "api_base", "base_url"])
-def cached_litellm_completion(request: Dict[str, Any], num_retries: int):
-    import litellm
-
-    if litellm.cache:
-        litellm_cache_args = {"no-cache": False, "no-store": False}
-    else:
-        litellm_cache_args = {"no-cache": True, "no-store": True}
-
-    return litellm_completion(
-        request,
-        cache=litellm_cache_args,
-        num_retries=num_retries,
-    )
-
-
 def litellm_completion(request: Dict[str, Any], num_retries: int, cache={"no-cache": True, "no-store": True}):
     retry_kwargs = dict(
         retry_policy=_get_litellm_retry_policy(num_retries),
@@ -267,22 +276,6 @@ async def stream_completion():
     return stream_completion()
 
 
-@request_cache(cache_arg_name="request", ignored_args_for_cache_key=["api_key", "api_base", "base_url"])
-def cached_litellm_text_completion(request: Dict[str, Any], num_retries: int):
-    import litellm
-
-    if litellm.cache:
-        litellm_cache_args = {"no-cache": False, "no-store": False}
-    else:
-        litellm_cache_args = {"no-cache": True, "no-store": True}
-
-    return litellm_text_completion(
-        request,
-        num_retries=num_retries,
-        cache=litellm_cache_args,
-    )
-
-
 def litellm_text_completion(request: Dict[str, Any], num_retries: int, cache={"no-cache": True, "no-store": True}):
     # Extract the provider and model from the model string.
     # TODO: Not all the models are in the format of "provider/model"
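
With `cached_litellm_completion` and `cached_litellm_text_completion` removed, both `forward` and `aforward` build their completion function through `_get_cached_completion_fn`, and caching can still be overridden per request via the `cache` and `cache_in_memory` kwargs that the methods pop. An illustrative sketch, not part of the commit, assuming a configured API key; the prompts are placeholders:

import dspy

lm = dspy.LM(model="openai/gpt-4o-mini")
messages = [{"role": "user", "content": "Hello"}]

# Default: the request cache wraps the litellm call, and litellm's own cache
# is used only if litellm.cache is configured.
lm.forward(messages=messages)

# Disable caching for this one request; litellm is called with
# {"no-cache": True, "no-store": True} and no request_cache wrapper.
lm.forward(messages=messages, cache=False)

# Keep disk caching but skip in-memory writes for this request. The same
# kwargs apply on the async path, e.g.
# `await lm.aforward(messages=messages, cache_in_memory=False)`.
lm.forward(messages=messages, cache_in_memory=False)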

tests/clients/test_cache.py

Lines changed: 28 additions & 0 deletions
@@ -254,3 +254,31 @@ def test_function2(prompt, model):
 
     # Because model arg is not ignored, the second call should return a different result
     assert result3 != result4
+
+
+@pytest.mark.asyncio
+async def test_request_cache_decorator_async(cache):
+    """Test the request_cache decorator with async functions."""
+    from dspy.clients.cache import request_cache
+
+    # Mock the dspy.cache attribute
+    with patch("dspy.cache", cache):
+        # Define a test function
+        @request_cache()
+        async def test_function(prompt, model):
+            return f"Response for {prompt} with {model}"
+
+        # First call should compute the result
+        result1 = await test_function(prompt="Hello", model="openai/gpt-4o-mini")
+        assert result1 == "Response for Hello with openai/gpt-4o-mini"
+
+        # Second call with same arguments should use cache
+        with patch.object(cache, "get") as mock_get:
+            mock_get.return_value = "Cached response"
+            result2 = await test_function(prompt="Hello", model="openai/gpt-4o-mini")
+            assert result2 == "Cached response"
+            mock_get.assert_called_once()
+
+        # Call with different arguments should compute again
+        result3 = await test_function(prompt="Different", model="openai/gpt-4o-mini")
+        assert result3 == "Response for Different with openai/gpt-4o-mini"

tests/clients/test_lm.py

Lines changed: 40 additions & 0 deletions
@@ -377,3 +377,43 @@ async def test_async_lm_call():
 
     assert result == ["answer"]
     mock_acompletion.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_async_lm_call_with_cache(tmp_path):
+    """Test the async LM call with caching."""
+    original_cache = dspy.cache
+    dspy.clients.configure_cache(
+        enable_disk_cache=True,
+        enable_memory_cache=True,
+        enable_litellm_cache=False,
+        disk_cache_dir=tmp_path / ".disk_cache",
+    )
+    cache = dspy.cache
+
+    lm = dspy.LM(model="openai/gpt-4o-mini")
+
+    with mock.patch("dspy.clients.lm.alitellm_completion") as mock_alitellm_completion:
+        mock_alitellm_completion.return_value = ModelResponse(
+            choices=[Choices(message=Message(content="answer"))], model="openai/gpt-4o-mini"
+        )
+        mock_alitellm_completion.__qualname__ = "alitellm_completion"
+        await lm.acall("Query")
+
+        assert len(cache.memory_cache) == 1
+        cache_key = next(iter(cache.memory_cache.keys()))
+        assert cache_key in cache.disk_cache
+        assert mock_alitellm_completion.call_count == 1
+
+        await lm.acall("Query")
+        # Second call should hit the cache, so no new call to LiteLLM is made.
+        assert mock_alitellm_completion.call_count == 1
+
+        # Test that explicitly disabling memory cache works
+        await lm.acall("New query", cache_in_memory=False)
+
+        # There should be a new call to LiteLLM on new query, but the memory cache shouldn't be written to.
+        assert len(cache.memory_cache) == 1
+        assert mock_alitellm_completion.call_count == 2
+
+    dspy.cache = original_cache
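
The cache configuration in this test is the same `configure_cache` call an application would make; a short sketch outside the test harness, with the cache directory chosen purely for illustration:

import dspy

# Enable DSPy's memory and disk caches and leave litellm's cache off,
# mirroring the setup in the test above. The directory path is illustrative.
dspy.clients.configure_cache(
    enable_disk_cache=True,
    enable_memory_cache=True,
    enable_litellm_cache=False,
    disk_cache_dir="/tmp/dspy_disk_cache",
)

lm = dspy.LM(model="openai/gpt-4o-mini")
# Sync and async calls now share this cache: repeated identical requests
# populate dspy.cache once and are served from it afterwards.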
