|
|
@@ -138,6 +138,7 @@ class RemoteRuntime(Runtime):
|
|
|
response = self._send_request(
|
|
|
'GET',
|
|
|
f'{self.config.sandbox.remote_runtime_api_url}/sessions/{self.sid}',
|
|
|
+ is_retry=False,
|
|
|
timeout=5,
|
|
|
)
|
|
|
except requests.HTTPError as e:
|
|
|
@@ -168,6 +169,7 @@ class RemoteRuntime(Runtime):
|
|
|
response = self._send_request(
|
|
|
'GET',
|
|
|
f'{self.config.sandbox.remote_runtime_api_url}/registry_prefix',
|
|
|
+ is_retry=False,
|
|
|
timeout=10,
|
|
|
)
|
|
|
response_json = response.json()
|
|
|
@@ -198,6 +200,7 @@ class RemoteRuntime(Runtime):
|
|
|
response = self._send_request(
|
|
|
'GET',
|
|
|
f'{self.config.sandbox.remote_runtime_api_url}/image_exists',
|
|
|
+ is_retry=False,
|
|
|
params={'image': self.container_image},
|
|
|
timeout=10,
|
|
|
)
|
|
|
@@ -234,6 +237,7 @@ class RemoteRuntime(Runtime):
|
|
|
response = self._send_request(
|
|
|
'POST',
|
|
|
f'{self.config.sandbox.remote_runtime_api_url}/start',
|
|
|
+ is_retry=False,
|
|
|
json=start_request,
|
|
|
)
|
|
|
self._parse_runtime_response(response)
|
|
|
@@ -246,6 +250,7 @@ class RemoteRuntime(Runtime):
|
|
|
self._send_request(
|
|
|
'POST',
|
|
|
f'{self.config.sandbox.remote_runtime_api_url}/resume',
|
|
|
+ is_retry=False,
|
|
|
json={'runtime_id': self.runtime_id},
|
|
|
timeout=30,
|
|
|
)
|
|
|
@@ -283,14 +288,12 @@ class RemoteRuntime(Runtime):
|
|
|
assert runtime_data['runtime_id'] == self.runtime_id
|
|
|
assert 'pod_status' in runtime_data
|
|
|
pod_status = runtime_data['pod_status']
|
|
|
+ self.log('debug', runtime_data)
|
|
|
+ self.log('debug', f'Pod status: {pod_status}')
|
|
|
|
|
|
# FIXME: We should fix it at the backend of /start endpoint, make sure
|
|
|
# the pod is created before returning the response.
|
|
|
# Retry a period of time to give the cluster time to start the pod
|
|
|
- if pod_status == 'Not Found':
|
|
|
- raise RuntimeNotReadyError(
|
|
|
- f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}'
|
|
|
- )
|
|
|
if pod_status == 'Ready':
|
|
|
try:
|
|
|
self._send_request(
|
|
|
@@ -305,12 +308,23 @@ class RemoteRuntime(Runtime):
|
|
|
f'Runtime /alive failed to respond with 200: {e}'
|
|
|
)
|
|
|
return
|
|
|
- if pod_status in ('Failed', 'Unknown'):
|
|
|
+ elif (
|
|
|
+ pod_status == 'Not Found'
|
|
|
+ or pod_status == 'Pending'
|
|
|
+ or pod_status == 'Running'
|
|
|
+ ): # nb: Running is not yet Ready
|
|
|
+ raise RuntimeNotReadyError(
|
|
|
+ f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}'
|
|
|
+ )
|
|
|
+ elif pod_status in ('Failed', 'Unknown'):
|
|
|
# clean up the runtime
|
|
|
self.close()
|
|
|
raise RuntimeError(
|
|
|
f'Runtime (ID={self.runtime_id}) failed to start. Current status: {pod_status}'
|
|
|
)
|
|
|
+ else:
|
|
|
+ # Maybe this should be a hard failure, but passing through in case the API changes
|
|
|
+ self.log('warning', f'Unknown pod status: {pod_status}')
|
|
|
|
|
|
self.log(
|
|
|
'debug',
|
|
|
@@ -327,6 +341,7 @@ class RemoteRuntime(Runtime):
|
|
|
response = self._send_request(
|
|
|
'POST',
|
|
|
f'{self.config.sandbox.remote_runtime_api_url}/stop',
|
|
|
+ is_retry=False,
|
|
|
json={'runtime_id': self.runtime_id},
|
|
|
timeout=timeout,
|
|
|
)
|
|
|
@@ -342,7 +357,7 @@ class RemoteRuntime(Runtime):
|
|
|
finally:
|
|
|
self.session.close()
|
|
|
|
|
|
- def run_action(self, action: Action) -> Observation:
|
|
|
+ def run_action(self, action: Action, is_retry: bool = False) -> Observation:
|
|
|
if action.timeout is None:
|
|
|
action.timeout = self.config.sandbox.timeout
|
|
|
if isinstance(action, FileEditAction):
|
|
|
@@ -367,6 +382,7 @@ class RemoteRuntime(Runtime):
|
|
|
response = self._send_request(
|
|
|
'POST',
|
|
|
f'{self.runtime_url}/execute_action',
|
|
|
+ is_retry=False,
|
|
|
json=request_body,
|
|
|
# wait a few more seconds to get the timeout error from client side
|
|
|
timeout=action.timeout + 5,
|
|
|
@@ -380,7 +396,7 @@ class RemoteRuntime(Runtime):
|
|
|
)
|
|
|
return obs
|
|
|
|
|
|
- def _send_request(self, method, url, **kwargs):
|
|
|
+ def _send_request(self, method, url, is_retry=False, **kwargs):
|
|
|
is_runtime_request = self.runtime_url and self.runtime_url in url
|
|
|
try:
|
|
|
return send_request(self.session, method, url, **kwargs)
|
|
|
@@ -392,6 +408,15 @@ class RemoteRuntime(Runtime):
|
|
|
raise RuntimeDisconnectedError(
|
|
|
f'404 error while connecting to {self.runtime_url}'
|
|
|
)
|
|
|
+ elif is_runtime_request and e.response.status_code == 503:
|
|
|
+ if not is_retry:
|
|
|
+ self.log('warning', 'Runtime appears to be paused. Resuming...')
|
|
|
+ self._resume_runtime()
|
|
|
+ self._wait_until_alive()
|
|
|
+ return self._send_request(method, url, True, **kwargs)
|
|
|
+ else:
|
|
|
+ raise e
|
|
|
+
|
|
|
else:
|
|
|
raise e
|
|
|
|
|
|
@@ -444,6 +469,7 @@ class RemoteRuntime(Runtime):
|
|
|
response = self._send_request(
|
|
|
'POST',
|
|
|
f'{self.runtime_url}/upload_file',
|
|
|
+ is_retry=False,
|
|
|
files=upload_data,
|
|
|
params=params,
|
|
|
timeout=300,
|
|
|
@@ -467,6 +493,7 @@ class RemoteRuntime(Runtime):
|
|
|
response = self._send_request(
|
|
|
'POST',
|
|
|
f'{self.runtime_url}/list_files',
|
|
|
+ is_retry=False,
|
|
|
json=data,
|
|
|
timeout=30,
|
|
|
)
|
|
|
@@ -480,6 +507,7 @@ class RemoteRuntime(Runtime):
|
|
|
response = self._send_request(
|
|
|
'GET',
|
|
|
f'{self.runtime_url}/download_files',
|
|
|
+ is_retry=False,
|
|
|
params=params,
|
|
|
stream=True,
|
|
|
timeout=30,
|