Browse Source

[Runtime] Audit HTTP Retry timeouts (#4282)

tofarr 1 year ago
parent
commit
5097c4fe71

+ 16 - 7
openhands/runtime/builder/remote.py

@@ -7,7 +7,7 @@ import requests
 
 from openhands.core.logger import openhands_logger as logger
 from openhands.runtime.builder import RuntimeBuilder
-from openhands.runtime.utils.request import send_request
+from openhands.runtime.utils.request import send_request_with_retry
 from openhands.runtime.utils.shutdown_listener import (
     should_continue,
     sleep_if_should_continue,
@@ -44,9 +44,13 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
         for tag in tags[1:]:
             files.append(('tags', (None, tag)))
 
-        # Send the POST request to /build
-        response = send_request(
-            self.session, 'POST', f'{self.api_url}/build', files=files
+        # Send the POST request to /build (Begins the build process)
+        response = send_request_with_retry(
+            self.session,
+            'POST',
+            f'{self.api_url}/build',
+            files=files,
+            timeout=30,
         )
 
         if response.status_code != 202:
@@ -65,11 +69,12 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
                 logger.error('Build timed out after 30 minutes')
                 raise RuntimeError('Build timed out after 30 minutes')
 
-            status_response = send_request(
+            status_response = send_request_with_retry(
                 self.session,
                 'GET',
                 f'{self.api_url}/build_status',
                 params={'build_id': build_id},
+                timeout=30,
             )
 
             if status_response.status_code != 200:
@@ -106,8 +111,12 @@ class RemoteRuntimeBuilder(RuntimeBuilder):
     def image_exists(self, image_name: str, pull_from_repo: bool = True) -> bool:
         """Checks if an image exists in the remote registry using the /image_exists endpoint."""
         params = {'image': image_name}
-        response = send_request(
-            self.session, 'GET', f'{self.api_url}/image_exists', params=params
+        response = send_request_with_retry(
+            self.session,
+            'GET',
+            f'{self.api_url}/image_exists',
+            params=params,
+            timeout=30,
         )
 
         if response.status_code != 200:

+ 8 - 7
openhands/runtime/client/runtime.py

@@ -35,9 +35,7 @@ from openhands.runtime.builder import DockerRuntimeBuilder
 from openhands.runtime.plugins import PluginRequirement
 from openhands.runtime.runtime import Runtime
 from openhands.runtime.utils import find_available_tcp_port
-from openhands.runtime.utils.request import (
-    send_request,
-)
+from openhands.runtime.utils.request import send_request_with_retry
 from openhands.runtime.utils.runtime_build import build_runtime_image
 from openhands.utils.tenacity_stop import stop_if_should_exit
 
@@ -336,11 +334,12 @@ class EventStreamRuntime(Runtime):
         if not (self.log_buffer and self.log_buffer.client_ready):
             raise RuntimeError('Runtime client is not ready.')
 
-        response = send_request(
+        response = send_request_with_retry(
             self.session,
             'GET',
             f'{self.api_url}/alive',
             retry_exceptions=[ConnectionRefusedError],
+            timeout=300,  # 5 minutes gives the container time to be alive 🧟‍♂️
         )
         if response.status_code == 200:
             return
@@ -424,7 +423,7 @@ class EventStreamRuntime(Runtime):
             assert action.timeout is not None
 
             try:
-                response = send_request(
+                response = send_request_with_retry(
                     self.session,
                     'POST',
                     f'{self.api_url}/execute_action',
@@ -505,12 +504,13 @@ class EventStreamRuntime(Runtime):
 
             params = {'destination': sandbox_dest, 'recursive': str(recursive).lower()}
 
-            response = send_request(
+            response = send_request_with_retry(
                 self.session,
                 'POST',
                 f'{self.api_url}/upload_file',
                 files=upload_data,
                 params=params,
+                timeout=300,
             )
             if response.status_code == 200:
                 return
@@ -539,11 +539,12 @@ class EventStreamRuntime(Runtime):
             if path is not None:
                 data['path'] = path
 
-            response = send_request(
+            response = send_request_with_retry(
                 self.session,
                 'POST',
                 f'{self.api_url}/list_files',
                 json=data,
+                timeout=30,  # 30 seconds because the container should already be alive
             )
             if response.status_code == 200:
                 response_json = response.json()

+ 14 - 9
openhands/runtime/remote/runtime.py

@@ -39,7 +39,7 @@ from openhands.runtime.runtime import Runtime
 from openhands.runtime.utils.request import (
     DEFAULT_RETRY_EXCEPTIONS,
     is_404_error,
-    send_request,
+    send_request_with_retry,
 )
 from openhands.runtime.utils.runtime_build import build_runtime_image
 from openhands.utils.tenacity_stop import stop_if_should_exit
@@ -97,10 +97,11 @@ class RemoteRuntime(Runtime):
                 f'Building remote runtime with base image: {self.config.sandbox.base_container_image}'
             )
             logger.debug(f'RemoteRuntime `{sid}` config:\n{self.config}')
-            response = send_request(
+            response = send_request_with_retry(
                 self.session,
                 'GET',
                 f'{self.config.sandbox.remote_runtime_api_url}/registry_prefix',
+                timeout=30,
             )
             response_json = response.json()
             registry_prefix = response_json['registry_prefix']
@@ -125,11 +126,12 @@ class RemoteRuntime(Runtime):
                 force_rebuild=self.config.sandbox.force_rebuild_runtime,
             )
 
-            response = send_request(
+            response = send_request_with_retry(
                 self.session,
                 'GET',
                 f'{self.config.sandbox.remote_runtime_api_url}/image_exists',
                 params={'image': self.container_image},
+                timeout=30,
             )
             if response.status_code != 200 or not response.json()['exists']:
                 raise RuntimeError(
@@ -164,11 +166,12 @@ class RemoteRuntime(Runtime):
 
         self.send_status_message('STATUS$WAITING_FOR_CLIENT')
         # Start the sandbox using the /start endpoint
-        response = send_request(
+        response = send_request_with_retry(
             self.session,
             'POST',
             f'{self.config.sandbox.remote_runtime_api_url}/start',
             json=start_request,
+            timeout=300,
         )
         if response.status_code != 201:
             raise RuntimeError(f'Failed to start sandbox: {response.text}')
@@ -216,7 +219,7 @@ class RemoteRuntime(Runtime):
     )
     def _wait_until_alive(self):
         logger.info(f'Waiting for runtime to be alive at url: {self.runtime_url}')
-        response = send_request(
+        response = send_request_with_retry(
             self.session,
             'GET',
             f'{self.runtime_url}/alive',
@@ -235,7 +238,7 @@ class RemoteRuntime(Runtime):
     def close(self, timeout: int = 10):
         if self.runtime_id:
             try:
-                response = send_request(
+                response = send_request_with_retry(
                     self.session,
                     'POST',
                     f'{self.config.sandbox.remote_runtime_api_url}/stop',
@@ -273,7 +276,7 @@ class RemoteRuntime(Runtime):
                 logger.info('Executing action')
                 request_body = {'action': event_to_dict(action)}
                 logger.debug(f'Request body: {request_body}')
-                response = send_request(
+                response = send_request_with_retry(
                     self.session,
                     'POST',
                     f'{self.runtime_url}/execute_action',
@@ -351,7 +354,7 @@ class RemoteRuntime(Runtime):
 
             params = {'destination': sandbox_dest, 'recursive': str(recursive).lower()}
 
-            response = send_request(
+            response = send_request_with_retry(
                 self.session,
                 'POST',
                 f'{self.runtime_url}/upload_file',
@@ -360,6 +363,7 @@ class RemoteRuntime(Runtime):
                 retry_exceptions=list(
                     filter(lambda e: e != TimeoutError, DEFAULT_RETRY_EXCEPTIONS)
                 ),
+                timeout=300,
             )
             if response.status_code == 200:
                 logger.info(
@@ -385,7 +389,7 @@ class RemoteRuntime(Runtime):
             if path is not None:
                 data['path'] = path
 
-            response = send_request(
+            response = send_request_with_retry(
                 self.session,
                 'POST',
                 f'{self.runtime_url}/list_files',
@@ -393,6 +397,7 @@ class RemoteRuntime(Runtime):
                 retry_exceptions=list(
                     filter(lambda e: e != TimeoutError, DEFAULT_RETRY_EXCEPTIONS)
                 ),
+                timeout=30,  # The runtime sbould already be running here
             )
             if response.status_code == 200:
                 response_json = response.json()

+ 3 - 3
openhands/runtime/utils/request.py

@@ -33,13 +33,13 @@ DEFAULT_RETRY_EXCEPTIONS = [
 ]
 
 
-def send_request(
+def send_request_with_retry(
     session: requests.Session,
     method: str,
     url: str,
+    timeout: int,
     retry_exceptions: list[Type[Exception]] | None = None,
     retry_fns: list[Callable[[Exception], bool]] | None = None,
-    timeout: int = 120,
     **kwargs: Any,
 ) -> requests.Response:
     exceptions_to_catch = retry_exceptions or DEFAULT_RETRY_EXCEPTIONS
@@ -54,7 +54,7 @@ def send_request(
 
     @retry(
         stop=stop_after_delay(timeout) | stop_if_should_exit(),
-        wait=wait_exponential(multiplier=1, min=4, max=60),
+        wait=wait_exponential(multiplier=1, min=4, max=20),
         retry=retry_condition,
         reraise=True,
     )