
Tweak connect exceptions (#1120)

* Clean up manual sleep

* Add default retries and document them.

* Add docstrings to llm

* Add exponential backoff for rate limiting errors

* Get embeddings for the action and its own content, not the user message

* Add a few bad exceptions to stop loop

* Stop loop when the step has no action

* Add action with content, no message, to history

* Make retry settings customizable

* Fix condense to stop the loop for the same reasons as completion

* Add 500-504 exceptions to retries

* Document the retry variables

* Add retries and limits for embeddings. Replaces llama-index hard-coded decorator.

* Rename to retry_min_wait and retry_max_wait
Engel Nyst 1 year ago
parent
commit
464bf7ee23

+ 5 - 0
README.md

@@ -177,6 +177,11 @@ can only be as powerful as the models driving it--fortunately folks on our team
 are actively working on building better open source models!
 
 
+**Note on API retries and rate limits:**
+Some LLMs have rate limits and may require retries. OpenDevin will automatically retry requests if it receives a 429 error or an API connection error.
+You can set the LLM_NUM_RETRIES, LLM_RETRY_MIN_WAIT and LLM_RETRY_MAX_WAIT environment variables to control the number of retries and the wait between them.
+By default, LLM_NUM_RETRIES is 5, and LLM_RETRY_MIN_WAIT and LLM_RETRY_MAX_WAIT are 3 and 60 seconds respectively.
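For illustration, the three variables above can be set like any other environment variable before starting OpenDevin; the values below are arbitrary examples, shown in Python for consistency with the rest of the patch:

```python
import os

# Illustrative values only; the defaults are 5 retries with waits between 3 and 60 seconds.
os.environ['LLM_NUM_RETRIES'] = '3'
os.environ['LLM_RETRY_MIN_WAIT'] = '5'    # seconds
os.environ['LLM_RETRY_MAX_WAIT'] = '30'   # seconds
```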
+
 ## ⭐️ Research Strategy
 
 Achieving full replication of production-grade applications with LLMs is a complex endeavor. Our strategy involves:

+ 36 - 0
agenthub/monologue_agent/utils/memory.py

@@ -1,3 +1,4 @@
+import llama_index.embeddings.openai.base as llama_openai
 from threading import Thread
 
 import chromadb
@@ -5,11 +6,46 @@ from llama_index.core import Document
 from llama_index.core.retrievers import VectorIndexRetriever
 from llama_index.core import VectorStoreIndex
 from llama_index.vector_stores.chroma import ChromaVectorStore
+from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential
+from openai._exceptions import APIConnectionError, RateLimitError, InternalServerError
 
 from opendevin import config
 from opendevin.logger import opendevin_logger as logger
 from . import json
 
+num_retries = config.get('LLM_NUM_RETRIES')
+retry_min_wait = config.get('LLM_RETRY_MIN_WAIT')
+retry_max_wait = config.get('LLM_RETRY_MAX_WAIT')
+
+# llama-index includes a retry decorator around the openai get_embeddings() function,
+# initialized with hard-coded wait times and exception types.
+# This non-customizable behavior causes issues when it retries faster than a provider's rate limit allows,
+# so this block attempts to unwrap it and replace it with our own decorator, letting users set their own limits.
+
+if hasattr(llama_openai.get_embeddings, '__wrapped__'):
+    original_get_embeddings = llama_openai.get_embeddings.__wrapped__
+else:
+    logger.warning('Cannot set custom retry limits.')  # warn
+    num_retries = 1
+    original_get_embeddings = llama_openai.get_embeddings
+
+
+def attempt_on_error(retry_state):
+    logger.error(f'{retry_state.outcome.exception()}. Attempt #{retry_state.attempt_number} | You can customize these settings in the configuration.', exc_info=False)
+    return True
+
+
+@retry(reraise=True,
+       stop=stop_after_attempt(num_retries),
+       wait=wait_random_exponential(min=retry_min_wait, max=retry_max_wait),
+       retry=retry_if_exception_type((RateLimitError, APIConnectionError, InternalServerError)),
+       after=attempt_on_error)
+def wrapper_get_embeddings(*args, **kwargs):
+    return original_get_embeddings(*args, **kwargs)
+
+
+llama_openai.get_embeddings = wrapper_get_embeddings
+
 embedding_strategy = config.get('LLM_EMBEDDING_MODEL')
 
 # TODO: More embeddings: https://docs.llamaindex.ai/en/stable/examples/embeddings/OpenAI/
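The unwrapping trick above works because tenacity decorates functions with functools.wraps, which exposes the undecorated callable as `__wrapped__`. A minimal standalone sketch (not repository code) of the same pattern:

```python
from tenacity import retry, stop_after_attempt, wait_fixed


@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
def fetch():
    """Stand-in for a remote call wrapped in a hard-coded retry policy."""
    raise ConnectionError('transient failure')


# functools.wraps (applied by tenacity) exposes the original function here;
# memory.py relies on exactly this to strip llama-index's built-in policy and
# re-wrap get_embeddings with user-configurable limits.
original_fetch = fetch.__wrapped__
assert original_fetch.__name__ == 'fetch'
```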

+ 7 - 5
agenthub/monologue_agent/utils/monologue.py

@@ -1,9 +1,9 @@
-import traceback
 
 from opendevin.llm.llm import LLM
 from opendevin.exceptions import AgentEventTypeError
 import agenthub.monologue_agent.utils.json as json
 import agenthub.monologue_agent.utils.prompts as prompts
+from opendevin.logger import opendevin_logger as logger
 
 
 class Monologue:
@@ -53,7 +53,7 @@ class Monologue:
             try:
                 total_length += len(json.dumps(t))
             except TypeError as e:
-                print(f'Error serializing thought: {e}')
+                logger.error('Error serializing thought: %s', str(e), exc_info=False)
         return total_length
 
     def condense(self, llm: LLM):
@@ -64,7 +64,7 @@ class Monologue:
         - llm (LLM): llm to be used for summarization
 
         Raises:
-        - RunTimeError: When the condensing process fails for any reason
+        - Exception: re-raises the same exception it got from the llm or from processing the response
         """
 
         try:
@@ -74,5 +74,7 @@ class Monologue:
             summary_resp = resp['choices'][0]['message']['content']
             self.thoughts = prompts.parse_summary_response(summary_resp)
         except Exception as e:
-            traceback.print_exc()
-            raise RuntimeError(f'Error condensing thoughts: {e}')
+            logger.error('Error condensing thoughts: %s', str(e), exc_info=False)
+
+            # TODO: If the llm fails with ContextWindowExceededError, we can try to condense the monologue chunk by chunk
+            raise

+ 18 - 2
opendevin/config.py

@@ -5,6 +5,9 @@ import toml
 from dotenv import load_dotenv
 
 from opendevin.schema import ConfigType
+import logging
+
+logger = logging.getLogger(__name__)
 
 load_dotenv()
 
@@ -21,8 +24,9 @@ DEFAULT_CONFIG: dict = {
     ConfigType.LLM_EMBEDDING_MODEL: 'local',
     ConfigType.LLM_EMBEDDING_DEPLOYMENT_NAME: None,
     ConfigType.LLM_API_VERSION: None,
-    ConfigType.LLM_NUM_RETRIES: 1,
-    ConfigType.LLM_COOLDOWN_TIME: 1,
+    ConfigType.LLM_NUM_RETRIES: 5,
+    ConfigType.LLM_RETRY_MIN_WAIT: 3,
+    ConfigType.LLM_RETRY_MAX_WAIT: 60,
     ConfigType.MAX_ITERATIONS: 100,
     # GPT-4 pricing is $10 per 1M input tokens. Since tokenization happens on LLM side,
     # we cannot easily count number of tokens, but we can count characters.
@@ -41,6 +45,16 @@ if os.path.exists('config.toml'):
     with open('config.toml', 'rb') as f:
         config_str = f.read().decode('utf-8')
 
+
+def int_value(value, default, config_key):
+    # FIXME use a library
+    try:
+        return int(value)
+    except ValueError:
+        logger.warning(f'Invalid value for {config_key}: {value} not applied. Using default value {default}')
+        return default
+
+
 tomlConfig = toml.loads(config_str)
 config = DEFAULT_CONFIG.copy()
 for k, v in config.items():
@@ -48,6 +62,8 @@ for k, v in config.items():
         config[k] = os.environ[k]
     elif k in tomlConfig:
         config[k] = tomlConfig[k]
+    if k in [ConfigType.LLM_NUM_RETRIES, ConfigType.LLM_RETRY_MIN_WAIT, ConfigType.LLM_RETRY_MAX_WAIT]:
+        config[k] = int_value(config[k], v, config_key=k)
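A minimal sketch (not the repository's code) of the precedence this loop applies to the three retry settings: environment variables override config.toml, which overrides the defaults, and invalid values fall back to the default as int_value() does:

```python
import os

DEFAULTS = {'LLM_NUM_RETRIES': 5, 'LLM_RETRY_MIN_WAIT': 3, 'LLM_RETRY_MAX_WAIT': 60}
toml_config = {'LLM_RETRY_MAX_WAIT': '90'}   # stand-in for values parsed from config.toml

config = {}
for key, default in DEFAULTS.items():
    # environment variable wins over config.toml, which wins over the default
    raw = os.environ.get(key, toml_config.get(key, default))
    try:
        config[key] = int(raw)
    except (TypeError, ValueError):
        # mirrors int_value(): an invalid value is ignored and the default kept
        config[key] = default

print(config)
```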
 
 
 def get_parser():

+ 8 - 12
opendevin/controller/agent_controller.py

@@ -1,10 +1,9 @@
 import asyncio
-import time
 import traceback
 from typing import Callable, List
 
-from litellm.exceptions import APIConnectionError
-from openai import AuthenticationError
+from openai import AuthenticationError, APIConnectionError
+from litellm import ContextWindowExceededError
 
 from opendevin import config
 from opendevin.action import (
@@ -170,26 +169,23 @@ class AgentController:
         observation: Observation = NullObservation('')
         try:
             action = self.agent.step(self.state)
-            logger.info(action, extra={'msg_type': 'ACTION'})
             if action is None:
                 raise AgentNoActionError()
+            logger.info(action, extra={'msg_type': 'ACTION'})
         except Exception as e:
             observation = AgentErrorObservation(str(e))
             logger.error(e)
             logger.debug(traceback.format_exc())
 
-            if isinstance(e, APIConnectionError):
-                time.sleep(3)
-
             # raise specific exceptions that need to be handled outside
-            # note: we are using AuthenticationError class from openai rather than
-            # litellm because:
+            # note: we are using classes from openai rather than litellm because:
             # 1) litellm.exceptions.AuthenticationError is a subclass of openai.AuthenticationError
-            # 2) embeddings call, initiated by llama-index, has no wrapper for authentication
-            #    errors. This means we have to catch individual authentication errors
+            # 2) the embeddings call, initiated by llama-index, has no wrapper for errors.
+            #    This means we have to catch individual authentication errors
             #    from different providers, and OpenAI is one of these.
-            if isinstance(e, (AuthenticationError, AgentNoActionError)):
+            if isinstance(e, (AuthenticationError, ContextWindowExceededError, APIConnectionError)):
                 raise
+
         self.update_state_after_step()
 
         await self._run_callbacks(action)
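Point 1 of the comment above can be checked directly; a small hedged sketch, assuming a litellm version where its exception class derives from openai's:

```python
from litellm.exceptions import AuthenticationError as LiteLLMAuthenticationError
from openai import AuthenticationError

# Because the litellm class subclasses the openai one, catching
# openai.AuthenticationError in the controller also covers the litellm variant.
print(issubclass(LiteLLMAuthenticationError, AuthenticationError))  # expected: True
```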

+ 37 - 14
opendevin/llm/llm.py

@@ -1,30 +1,55 @@
 
 from litellm import completion as litellm_completion
-from tenacity import retry, retry_if_exception_type, stop_after_attempt
-from litellm.exceptions import APIConnectionError, RateLimitError
+from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential
+from litellm.exceptions import APIConnectionError, RateLimitError, ServiceUnavailableError
 from functools import partial
 
 from opendevin import config
-from opendevin.logger import llm_prompt_logger, llm_response_logger, opendevin_logger
+from opendevin.logger import llm_prompt_logger, llm_response_logger
+from opendevin.logger import opendevin_logger as logger
+
 
 DEFAULT_API_KEY = config.get('LLM_API_KEY')
 DEFAULT_BASE_URL = config.get('LLM_BASE_URL')
 DEFAULT_MODEL_NAME = config.get('LLM_MODEL')
-DEFAULT_LLM_NUM_RETRIES = config.get('LLM_NUM_RETRIES')
-DEFAULT_LLM_COOLDOWN_TIME = config.get('LLM_COOLDOWN_TIME')
 DEFAULT_API_VERSION = config.get('LLM_API_VERSION')
+LLM_NUM_RETRIES = config.get('LLM_NUM_RETRIES')
+LLM_RETRY_MIN_WAIT = config.get('LLM_RETRY_MIN_WAIT')
+LLM_RETRY_MAX_WAIT = config.get('LLM_RETRY_MAX_WAIT')
 
 
 class LLM:
+    """
+    The LLM class represents a Language Model instance.
+    """
+
     def __init__(self,
                  model=DEFAULT_MODEL_NAME,
                  api_key=DEFAULT_API_KEY,
                  base_url=DEFAULT_BASE_URL,
-                 num_retries=DEFAULT_LLM_NUM_RETRIES,
-                 cooldown_time=DEFAULT_LLM_COOLDOWN_TIME,
                  api_version=DEFAULT_API_VERSION,
+                 num_retries=LLM_NUM_RETRIES,
+                 retry_min_wait=LLM_RETRY_MIN_WAIT,
+                 retry_max_wait=LLM_RETRY_MAX_WAIT,
                  ):
-        opendevin_logger.info(f'Initializing LLM with model: {model}')
+        """
+        Args:
+            model (str, optional): The name of the language model. Defaults to LLM_MODEL.
+            api_key (str, optional): The API key for accessing the language model. Defaults to LLM_API_KEY.
+            base_url (str, optional): The base URL for the language model API. Defaults to LLM_BASE_URL. Not necessary for OpenAI.
+            api_version (str, optional): The version of the API to use. Defaults to LLM_API_VERSION. Not necessary for OpenAI.
+            num_retries (int, optional): The number of retries for API calls. Defaults to LLM_NUM_RETRIES.
+            retry_min_wait (int, optional): The minimum time to wait between retries in seconds. Defaults to LLM_RETRY_MIN_WAIT.
+            retry_max_wait (int, optional): The maximum time to wait between retries in seconds. Defaults to LLM_RETRY_MAX_WAIT.
+
+        Attributes:
+            model_name (str): The name of the language model.
+            api_key (str): The API key for accessing the language model.
+            base_url (str): The base URL for the language model API.
+            api_version (str): The version of the API to use.
+            completion (function): A decorator for the litellm completion function.
+        """
+        logger.info(f'Initializing LLM with model: {model}')
         self.model_name = model
         self.api_key = api_key
         self.base_url = base_url
@@ -35,15 +60,13 @@ class LLM:
 
         completion_unwrapped = self._completion
 
-        def my_wait(retry_state):
-            seconds = (retry_state.attempt_number) * cooldown_time
-            opendevin_logger.warning(f'LLM error: {retry_state.outcome.exception()}')
-            opendevin_logger.info(f'Attempt #{retry_state.attempt_number} | Sleeping for {seconds}s')
-            return seconds
+        def attempt_on_error(retry_state):
+            logger.error(f'{retry_state.outcome.exception()}. Attempt #{retry_state.attempt_number} | You can customize these settings in the configuration.', exc_info=False)
+            return True
 
         @retry(reraise=True,
                stop=stop_after_attempt(num_retries),
-               wait=my_wait, retry=retry_if_exception_type((APIConnectionError, RateLimitError)))
+               wait=wait_random_exponential(min=retry_min_wait, max=retry_max_wait),
+               retry=retry_if_exception_type((RateLimitError, APIConnectionError, ServiceUnavailableError)),
+               after=attempt_on_error)
         def wrapper(*args, **kwargs):
             if 'messages' in kwargs:
                 messages = kwargs['messages']
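A hypothetical usage example of the constructor documented above, overriding the retry settings per instance; the shape of the completion call and response follows the usage in monologue.py:

```python
from opendevin.llm.llm import LLM

# Per-instance overrides; unset arguments fall back to the LLM_* config values.
llm = LLM(model='gpt-4', num_retries=3, retry_min_wait=5, retry_max_wait=30)

# The wrapped completion retries on RateLimitError, APIConnectionError and
# ServiceUnavailableError with random exponential backoff between 5 and 30 seconds.
resp = llm.completion(messages=[{'role': 'user', 'content': 'Hello'}])
print(resp['choices'][0]['message']['content'])
```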

+ 2 - 1
opendevin/schema/config.py

@@ -15,7 +15,8 @@ class ConfigType(str, Enum):
     LLM_EMBEDDING_DEPLOYMENT_NAME = 'LLM_EMBEDDING_DEPLOYMENT_NAME'
     LLM_API_VERSION = 'LLM_API_VERSION'
     LLM_NUM_RETRIES = 'LLM_NUM_RETRIES'
-    LLM_COOLDOWN_TIME = 'LLM_COOLDOWN_TIME'
+    LLM_RETRY_MIN_WAIT = 'LLM_RETRY_MIN_WAIT'
+    LLM_RETRY_MAX_WAIT = 'LLM_RETRY_MAX_WAIT'
     MAX_ITERATIONS = 'MAX_ITERATIONS'
     MAX_CHARS = 'MAX_CHARS'
     AGENT = 'AGENT'