feat: add metrics related to cost for better observability (#1944)

* add metrics for total_cost

* make lint

* refactor codeact

* move metrics into llm

* add costs list, add it to state

* refactor log completion

* refactor and test others

* make lint

* Update opendevin/core/metrics.py

Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>

* Update opendevin/llm/llm.py

Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>

* refactor

* add code

---------

Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
Yufan Song, 1 year ago
parent commit d18e6c85a0

+ 1 - 1
agenthub/SWE_agent/agent.py

@@ -42,7 +42,7 @@ class SWEAgent(Agent):
         self.running_memory.append(memory)
 
     def _think_act(self, messages: list[dict]) -> tuple[Action, str]:
-        resp = self.llm.completion(
+        resp = self.llm.do_completion(
             messages=messages,
             temperature=0.05,
         )

+ 1 - 17
agenthub/codeact_agent/codeact_agent.py

@@ -9,7 +9,6 @@ from agenthub.codeact_agent.prompt import (
 )
 from opendevin.controller.agent import Agent
 from opendevin.controller.state.state import State
-from opendevin.core.logger import opendevin_logger as logger
 from opendevin.events.action import (
     Action,
     AgentFinishAction,
@@ -173,7 +172,6 @@ class CodeActAgent(Agent):
         Resets the CodeAct Agent.
         """
         super().reset()
-        self.cost_accumulator = 0
 
     def step(self, state: State) -> Action:
         """
@@ -215,7 +213,7 @@ class CodeActAgent(Agent):
                 f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task.'
             )
 
-        response = self.llm.completion(
+        response = self.llm.do_completion(
             messages=messages,
             stop=[
                 '</execute_ipython>',
@@ -225,8 +223,6 @@ class CodeActAgent(Agent):
             temperature=0.0,
         )
 
-        self.log_cost(response)
-
         action_str: str = parse_response(response)
         state.num_of_chars += sum(
             len(message['content']) for message in messages
@@ -269,15 +265,3 @@ class CodeActAgent(Agent):
 
     def search_memory(self, query: str) -> list[str]:
         raise NotImplementedError('Implement this abstract method')
-
-    def log_cost(self, response):
-        try:
-            cur_cost = self.llm.completion_cost(response)
-        except Exception:
-            cur_cost = 0
-        self.cost_accumulator += cur_cost
-        logger.info(
-            'Cost: %.2f USD | Accumulated Cost: %.2f USD',
-            cur_cost,
-            self.cost_accumulator,
-        )

+ 1 - 1
agenthub/micro/agent.py

@@ -65,7 +65,7 @@ class MicroAgent(Agent):
             latest_user_message=latest_user_message,
         )
         messages = [{'content': prompt, 'role': 'user'}]
-        resp = self.llm.completion(messages=messages)
+        resp = self.llm.do_completion(messages=messages)
         action_resp = resp['choices'][0]['message']['content']
         state.num_of_chars += len(prompt) + len(action_resp)
         action = parse_response(action_resp)

+ 1 - 1
agenthub/monologue_agent/agent.py

@@ -242,7 +242,7 @@ class MonologueAgent(Agent):
             state.background_commands_obs,
         )
         messages = [{'content': prompt, 'role': 'user'}]
-        resp = self.llm.completion(messages=messages)
+        resp = self.llm.do_completion(messages=messages)
         action_resp = resp['choices'][0]['message']['content']
         state.num_of_chars += len(prompt) + len(action_resp)
         action = prompts.parse_action_response(action_resp)

+ 1 - 1
agenthub/planner_agent/agent.py

@@ -43,7 +43,7 @@ class PlannerAgent(Agent):
             return AgentFinishAction()
         prompt = get_prompt(state)
         messages = [{'content': prompt, 'role': 'user'}]
-        resp = self.llm.completion(messages=messages)
+        resp = self.llm.do_completion(messages=messages)
         action_resp = resp['choices'][0]['message']['content']
         state.num_of_chars += len(prompt) + len(action_resp)
         action = parse_response(action_resp)

+ 2 - 0
opendevin/controller/agent_controller.py

@@ -89,6 +89,8 @@ class AgentController:
 
     def update_state_after_step(self):
         self.state.updated_info = []
+        # sync metrics from the LLM, in particular the accumulated cost
+        self.state.metrics = self.agent.llm.metrics
 
     async def report_error(self, message: str, exception: Exception | None = None):
         self.state.error = message
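
Note that this assignment copies a reference, not a value: `state.metrics` and `agent.llm.metrics` point at the same Metrics object afterwards, so costs recorded by later completions show up in the state without further bookkeeping. A minimal sketch of that behavior (the FakeLLM stub is hypothetical):

    from opendevin.core.metrics import Metrics

    class FakeLLM:
        def __init__(self):
            self.metrics = Metrics()

    llm = FakeLLM()
    state_metrics = llm.metrics             # what update_state_after_step() effectively does
    llm.metrics.add_cost(0.01)              # a later completion records its cost
    print(state_metrics.accumulated_cost)   # 0.01 -- same object, updated in place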

+ 2 - 0
opendevin/controller/state/state.py

@@ -4,6 +4,7 @@ from dataclasses import dataclass, field
 
 from opendevin.controller.state.task import RootTask
 from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.metrics import Metrics
 from opendevin.core.schema import AgentState
 from opendevin.events.action import (
     Action,
@@ -30,6 +31,7 @@ class State:
     outputs: dict = field(default_factory=dict)
     error: str | None = None
     agent_state: AgentState = AgentState.LOADING
+    metrics: Metrics = field(default_factory=Metrics)
 
     def save_to_session(self, sid: str):
         fs = get_file_store()
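
With a dataclass, `field(default_factory=Metrics)` gives each State its own Metrics instance; a plain `Metrics()` default would be created once at class-definition time and shared by every State. A stripped-down illustration of the per-instance behavior (DemoState is hypothetical and shows only the metrics field):

    from dataclasses import dataclass, field

    from opendevin.core.metrics import Metrics

    @dataclass
    class DemoState:
        metrics: Metrics = field(default_factory=Metrics)  # fresh instance per object

    a, b = DemoState(), DemoState()
    a.metrics.add_cost(0.01)
    print(b.metrics.accumulated_cost)  # 0.0 -- b has its own Metrics object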

+ 46 - 0
opendevin/core/metrics.py

@@ -0,0 +1,46 @@
+class Metrics:
+    """
+    Metrics records measurements taken while the agent runs and during evaluation.
+    Currently the following metrics are defined:
+        accumulated_cost: the total cost (in USD) accrued by the current LLM.
+    """
+
+    def __init__(self) -> None:
+        self._accumulated_cost: float = 0.0
+        self._costs: list[float] = []
+
+    @property
+    def accumulated_cost(self) -> float:
+        return self._accumulated_cost
+
+    @accumulated_cost.setter
+    def accumulated_cost(self, value: float) -> None:
+        if value < 0:
+            raise ValueError('Total cost cannot be negative.')
+        self._accumulated_cost = value
+
+    @property
+    def costs(self) -> list:
+        return self._costs
+
+    def add_cost(self, value: float) -> None:
+        if value < 0:
+            raise ValueError('Added cost cannot be negative.')
+        self._accumulated_cost += value
+        self._costs.append(value)
+
+    def get(self) -> dict:
+        """
+        Return the metrics in a dictionary.
+        """
+        return {'accumulated_cost': self._accumulated_cost, 'costs': self._costs}
+
+    def log(self) -> str:
+        """
+        Return the metrics as a 'key: value' string suitable for logging.
+        """
+        metrics = self.get()
+        logs = ''
+        for key, value in metrics.items():
+            logs += f'{key}: {value}\n'
+        return logs
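
A minimal usage sketch of the new Metrics class (the cost values are made up; real values come from litellm's pricing data):

    from opendevin.core.metrics import Metrics

    metrics = Metrics()
    metrics.add_cost(0.012)            # cost of one completion, in USD
    metrics.add_cost(0.034)            # cost of a second completion

    print(metrics.accumulated_cost)    # ~0.046, the running total
    print(metrics.costs)               # [0.012, 0.034], each individual charge
    print(metrics.get())               # {'accumulated_cost': ..., 'costs': [...]}
    print(metrics.log(), end='')       # one 'key: value' line per metric

    # metrics.add_cost(-1.0)           # would raise ValueError('Added cost cannot be negative.')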

+ 31 - 1
opendevin/llm/llm.py

@@ -21,6 +21,7 @@ from tenacity import (
 from opendevin.core.config import config
 from opendevin.core.logger import llm_prompt_logger, llm_response_logger
 from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.metrics import Metrics
 
 __all__ = ['LLM']
 
@@ -58,6 +59,7 @@ class LLM:
         max_input_tokens=None,
         max_output_tokens=None,
         llm_config=None,
+        metrics=None,
     ):
         """
         Initializes the LLM. If LLMConfig is passed, its values will be the fallback.
@@ -77,7 +79,7 @@ class LLM:
             custom_llm_provider (str, optional): A custom LLM provider. Defaults to LLM_CUSTOM_LLM_PROVIDER.
             llm_timeout (int, optional): The maximum time to wait for a response in seconds. Defaults to LLM_TIMEOUT.
             llm_temperature (float, optional): The temperature for LLM sampling. Defaults to LLM_TEMPERATURE.
-
+            metrics (Metrics, optional): The metrics to record costs into. Defaults to None, in which case a fresh Metrics instance is created.
         """
         if llm_config is None:
             llm_config = config.llm
@@ -112,6 +114,7 @@ class LLM:
             if max_output_tokens is not None
             else llm_config.max_output_tokens
         )
+        metrics = metrics if metrics is not None else Metrics()
 
         logger.info(f'Initializing LLM with model: {model}')
         self.model_name = model
@@ -122,6 +125,7 @@ class LLM:
         self.max_output_tokens = max_output_tokens
         self.llm_timeout = llm_timeout
         self.custom_llm_provider = custom_llm_provider
+        self.metrics = metrics
 
         # litellm actually uses base Exception here for unknown model
         self.model_info = None
@@ -200,6 +204,30 @@ class LLM:
         """
         return self._completion
 
+    def do_completion(self, *args, **kwargs):
+        """
+        Wrapper for the litellm completion function.
+
+        Check the complete documentation at https://litellm.vercel.app/docs/completion
+        """
+        resp = self._completion(*args, **kwargs)
+        self.post_completion(resp)
+        return resp
+
+    def post_completion(self, response) -> None:
+        """
+        Post-process a completion response: compute its cost and log the running total.
+        """
+        try:
+            cur_cost = self.completion_cost(response)
+        except Exception:
+            cur_cost = 0
+        logger.info(
+            'Cost: %.2f USD | Accumulated Cost: %.2f USD',
+            cur_cost,
+            self.metrics.accumulated_cost,
+        )
+
     def get_token_count(self, messages):
         """
         Get the number of tokens in a list of messages.
@@ -231,6 +259,7 @@ class LLM:
     def completion_cost(self, response):
         """
         Calculate the cost of a completion response based on the model.  Local models are treated as free.
+        The computed cost is also recorded in metrics via add_cost().
 
         Args:
             response (list): A response from a model invocation.
@@ -241,6 +270,7 @@ class LLM:
         if not self.is_local():
             try:
                 cost = litellm_completion_cost(completion_response=response)
+                self.metrics.add_cost(cost)
                 return cost
             except Exception:
                 logger.warning('Cost calculation not supported for this model.')
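
A rough sketch of the intended flow (the model name and message are placeholders; actual costs depend on litellm's pricing table and whether the model is local):

    from opendevin.llm.llm import LLM

    llm = LLM(model='gpt-3.5-turbo')   # hypothetical model; unset options fall back to config
    resp = llm.do_completion(messages=[{'role': 'user', 'content': 'hello'}])

    # do_completion() calls the underlying litellm completion, then post_completion(),
    # which calls completion_cost(); that both logs the per-call cost and records it
    # in the shared Metrics object via metrics.add_cost().
    print(llm.metrics.accumulated_cost)  # running USD total across all calls on this LLM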

+ 1 - 1
opendevin/memory/condenser.py

@@ -16,7 +16,7 @@ class MemoryCondenser:
 
         try:
             messages = [{'content': summarize_prompt, 'role': 'user'}]
-            resp = llm.completion(messages=messages)
+            resp = llm.do_completion(messages=messages)
             summary_response = resp['choices'][0]['message']['content']
             return summary_response
         except Exception as e:

+ 6 - 6
tests/unit/test_micro_agents.py

@@ -31,7 +31,7 @@ def test_coder_agent_with_summary():
     """
     mock_llm = MagicMock()
     content = json.dumps({'action': 'finish', 'args': {}})
-    mock_llm.completion.return_value = {'choices': [{'message': {'content': content}}]}
+    mock_llm.do_completion.return_value = {'choices': [{'message': {'content': content}}]}
 
     coder_agent = Agent.get_cls('CoderAgent')(llm=mock_llm)
     assert coder_agent is not None
@@ -43,8 +43,8 @@ def test_coder_agent_with_summary():
     state = State(history=history, inputs={'summary': summary})
     coder_agent.step(state)
 
-    mock_llm.completion.assert_called_once()
-    _, kwargs = mock_llm.completion.call_args
+    mock_llm.do_completion.assert_called_once()
+    _, kwargs = mock_llm.do_completion.call_args
     prompt = kwargs['messages'][0]['content']
     assert task in prompt
     assert "Here's a summary of the codebase, as it relates to this task" in prompt
@@ -58,7 +58,7 @@ def test_coder_agent_without_summary():
     """
     mock_llm = MagicMock()
     content = json.dumps({'action': 'finish', 'args': {}})
-    mock_llm.completion.return_value = {'choices': [{'message': {'content': content}}]}
+    mock_llm.do_completion.return_value = {'choices': [{'message': {'content': content}}]}
 
     coder_agent = Agent.get_cls('CoderAgent')(llm=mock_llm)
     assert coder_agent is not None
@@ -69,8 +69,8 @@ def test_coder_agent_without_summary():
     state = State(history=history)
     coder_agent.step(state)
 
-    mock_llm.completion.assert_called_once()
-    _, kwargs = mock_llm.completion.call_args
+    mock_llm.do_completion.assert_called_once()
+    _, kwargs = mock_llm.do_completion.call_args
     prompt = kwargs['messages'][0]['content']
     assert task in prompt
     assert "Here's a summary of the codebase, as it relates to this task" not in prompt
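
For completeness, a hypothetical pytest-style unit test for the Metrics class itself (not part of this commit), in the same spirit as the tests above:

    import pytest

    from opendevin.core.metrics import Metrics

    def test_metrics_accumulates_costs():
        metrics = Metrics()
        metrics.add_cost(0.5)
        metrics.add_cost(0.25)
        assert metrics.accumulated_cost == 0.75   # 0.5 and 0.25 sum exactly in binary
        assert metrics.costs == [0.5, 0.25]
        assert metrics.get() == {'accumulated_cost': 0.75, 'costs': [0.5, 0.25]}

    def test_metrics_rejects_negative_cost():
        metrics = Metrics()
        with pytest.raises(ValueError):
            metrics.add_cost(-1.0)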