Bläddra i källkod

Fix resume runtime after a pause (#4904)

Robert Brennan 1 år sedan
förälder
incheckning
0633a99298
1 ändrade filer med 35 tillägg och 7 borttagningar
  1. 35 7
      openhands/runtime/impl/remote/remote_runtime.py

+ 35 - 7
openhands/runtime/impl/remote/remote_runtime.py

@@ -138,6 +138,7 @@ class RemoteRuntime(Runtime):
             response = self._send_request(
                 'GET',
                 f'{self.config.sandbox.remote_runtime_api_url}/sessions/{self.sid}',
+                is_retry=False,
                 timeout=5,
             )
         except requests.HTTPError as e:
@@ -168,6 +169,7 @@ class RemoteRuntime(Runtime):
         response = self._send_request(
             'GET',
             f'{self.config.sandbox.remote_runtime_api_url}/registry_prefix',
+            is_retry=False,
             timeout=10,
         )
         response_json = response.json()
@@ -198,6 +200,7 @@ class RemoteRuntime(Runtime):
         response = self._send_request(
             'GET',
             f'{self.config.sandbox.remote_runtime_api_url}/image_exists',
+            is_retry=False,
             params={'image': self.container_image},
             timeout=10,
         )
@@ -234,6 +237,7 @@ class RemoteRuntime(Runtime):
         response = self._send_request(
             'POST',
             f'{self.config.sandbox.remote_runtime_api_url}/start',
+            is_retry=False,
             json=start_request,
         )
         self._parse_runtime_response(response)
@@ -246,6 +250,7 @@ class RemoteRuntime(Runtime):
         self._send_request(
             'POST',
             f'{self.config.sandbox.remote_runtime_api_url}/resume',
+            is_retry=False,
             json={'runtime_id': self.runtime_id},
             timeout=30,
         )
@@ -283,14 +288,12 @@ class RemoteRuntime(Runtime):
         assert runtime_data['runtime_id'] == self.runtime_id
         assert 'pod_status' in runtime_data
         pod_status = runtime_data['pod_status']
+        self.log('debug', runtime_data)
+        self.log('debug', f'Pod status: {pod_status}')
 
         # FIXME: We should fix it at the backend of /start endpoint, make sure
         # the pod is created before returning the response.
         # Retry a period of time to give the cluster time to start the pod
-        if pod_status == 'Not Found':
-            raise RuntimeNotReadyError(
-                f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}'
-            )
         if pod_status == 'Ready':
             try:
                 self._send_request(
@@ -305,12 +308,23 @@ class RemoteRuntime(Runtime):
                     f'Runtime /alive failed to respond with 200: {e}'
                 )
             return
-        if pod_status in ('Failed', 'Unknown'):
+        elif (
+            pod_status == 'Not Found'
+            or pod_status == 'Pending'
+            or pod_status == 'Running'
+        ):  # nb: Running is not yet Ready
+            raise RuntimeNotReadyError(
+                f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}'
+            )
+        elif pod_status in ('Failed', 'Unknown'):
             # clean up the runtime
             self.close()
             raise RuntimeError(
                 f'Runtime (ID={self.runtime_id}) failed to start. Current status: {pod_status}'
             )
+        else:
+            # Maybe this should be a hard failure, but passing through in case the API changes
+            self.log('warning', f'Unknown pod status: {pod_status}')
 
         self.log(
             'debug',
@@ -327,6 +341,7 @@ class RemoteRuntime(Runtime):
                 response = self._send_request(
                     'POST',
                     f'{self.config.sandbox.remote_runtime_api_url}/stop',
+                    is_retry=False,
                     json={'runtime_id': self.runtime_id},
                     timeout=timeout,
                 )
@@ -342,7 +357,7 @@ class RemoteRuntime(Runtime):
             finally:
                 self.session.close()
 
-    def run_action(self, action: Action) -> Observation:
+    def run_action(self, action: Action, is_retry: bool = False) -> Observation:
         if action.timeout is None:
             action.timeout = self.config.sandbox.timeout
         if isinstance(action, FileEditAction):
@@ -367,6 +382,7 @@ class RemoteRuntime(Runtime):
                 response = self._send_request(
                     'POST',
                     f'{self.runtime_url}/execute_action',
+                    is_retry=False,
                     json=request_body,
                     # wait a few more seconds to get the timeout error from client side
                     timeout=action.timeout + 5,
@@ -380,7 +396,7 @@ class RemoteRuntime(Runtime):
                 )
             return obs
 
-    def _send_request(self, method, url, **kwargs):
+    def _send_request(self, method, url, is_retry=False, **kwargs):
         is_runtime_request = self.runtime_url and self.runtime_url in url
         try:
             return send_request(self.session, method, url, **kwargs)
@@ -392,6 +408,15 @@ class RemoteRuntime(Runtime):
                 raise RuntimeDisconnectedError(
                     f'404 error while connecting to {self.runtime_url}'
                 )
+            elif is_runtime_request and e.response.status_code == 503:
+                if not is_retry:
+                    self.log('warning', 'Runtime appears to be paused. Resuming...')
+                    self._resume_runtime()
+                    self._wait_until_alive()
+                    return self._send_request(method, url, True, **kwargs)
+                else:
+                    raise e
+
             else:
                 raise e
 
@@ -444,6 +469,7 @@ class RemoteRuntime(Runtime):
             response = self._send_request(
                 'POST',
                 f'{self.runtime_url}/upload_file',
+                is_retry=False,
                 files=upload_data,
                 params=params,
                 timeout=300,
@@ -467,6 +493,7 @@ class RemoteRuntime(Runtime):
         response = self._send_request(
             'POST',
             f'{self.runtime_url}/list_files',
+            is_retry=False,
             json=data,
             timeout=30,
         )
@@ -480,6 +507,7 @@ class RemoteRuntime(Runtime):
         response = self._send_request(
             'GET',
             f'{self.runtime_url}/download_files',
+            is_retry=False,
             params=params,
             stream=True,
             timeout=30,