I am quite new to Neptune and am trying to run (and log) some training jobs on 1 GPU. Everything went smoothly for ~20 hours, but then I got an error (`Failed to send channel value.`). I am wondering what might have caused this.
It happened at the same time to all 3 of my jobs. I see a few possibilities:
20:17:01 | E1023 04:07:48.158644 35187409351088 channels_values_sender.py:164] Failed to send channel value.
-- | --
20:17:01 | Traceback (most recent call last):
20:17:01 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/utils.py", line 210, in wrapper
20:17:01 | return func(*args, **kwargs)
20:17:01 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/internal/backends/hosted_neptune_backend.py", line 556, in send_channels_values
20:17:01 | channelsValues=input_channels_values
20:17:01 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 239, in response
20:17:01 | six.reraise(*sys.exc_info())
20:17:01 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/six.py", line 693, in reraise
20:17:01 | raise value
20:17:01 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 200, in response
20:17:01 | swagger_result = self._get_swagger_result(incoming_response)
20:17:01 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 124, in wrapper
20:17:01 | return func(self, *args, **kwargs)
20:17:01 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 303, in _get_swagger_result
20:17:01 | self.request_config.response_callbacks,
20:17:01 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 334, in unmarshal_response
20:17:01 | raise_on_unexpected(incoming_response)
20:17:01 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 408, in raise_on_unexpected
20:17:01 | raise make_http_exception(response=http_response)
20:17:01 | bravado.exception.HTTPInternalServerError: 500 : {"code":500,"errorType":"INTERNAL_SERVER_ERROR","title":"Internal Server Error (2fb6177e655)"}
20:17:01 | During handling of the above exception, another exception occurred:
20:17:01 | Traceback (most recent call last):
20:17:01 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/internal/channels/channels_values_sender.py", line 156, in _send_values
20:17:01 | self._experiment._send_channels_values(channels_with_values)
20:17:01 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/experiments.py", line 1138, in _send_channels_values
20:17:01 | self._backend.send_channels_values(self, channels_with_values)
20:17:01 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/utils.py", line 221, in wrapper
20:17:01 | raise ServerError()
20:17:01 | neptune.api_exceptions.ServerError: Server error. Please try again later.
20:18:56 | Traceback (most recent call last):
20:18:56 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/utils.py", line 210, in wrapper
20:18:56 | return func(*args, **kwargs)
20:18:56 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/internal/backends/hosted_neptune_backend.py", line 556, in send_channels_values
20:18:56 | channelsValues=input_channels_values
20:18:56 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 239, in response
20:18:56 | six.reraise(*sys.exc_info())
20:18:56 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/six.py", line 693, in reraise
20:18:56 | raise value
20:18:56 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 200, in response
20:18:56 | swagger_result = self._get_swagger_result(incoming_response)
20:18:56 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 124, in wrapper
20:18:56 | return func(self, *args, **kwargs)
20:18:56 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 303, in _get_swagger_result
20:18:56 | self.request_config.response_callbacks,
20:18:56 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 334, in unmarshal_response
20:18:56 | raise_on_unexpected(incoming_response)
20:18:56 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 408, in raise_on_unexpected
20:18:56 | raise make_http_exception(response=http_response)
20:18:56 | bravado.exception.HTTPInternalServerError: 500 : {"errorType":"INTERNAL_SERVER_ERROR","code":500,"title":"Internal Server Error (e50dc164b5c)"}
20:18:56 | During handling of the above exception, another exception occurred:
20:18:56 | Traceback (most recent call last):
20:18:56 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/internal/channels/channels_values_sender.py", line 156, in _send_values
20:18:56 | self._experiment._send_channels_values(channels_with_values)
20:18:56 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/experiments.py", line 1138, in _send_channels_values
20:18:56 | self._backend.send_channels_values(self, channels_with_values)
20:18:56 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/utils.py", line 221, in wrapper
20:18:56 | raise ServerError()
20:18:56 | neptune.api_exceptions.ServerError: Server error. Please try again later.
20:19:02 | E1023 04:09:49.150690 35187409351088 channels_values_sender.py:164] Failed to send channel value.
20:19:02 | Traceback (most recent call last):
20:19:02 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/utils.py", line 210, in wrapper
20:19:02 | return func(*args, **kwargs)
20:19:02 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/internal/backends/hosted_neptune_backend.py", line 556, in send_channels_values
20:19:02 | channelsValues=input_channels_values
20:19:02 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 239, in response
20:19:02 | six.reraise(*sys.exc_info())
20:19:02 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/six.py", line 693, in reraise
20:19:02 | raise value
20:19:02 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 200, in response
20:19:02 | swagger_result = self._get_swagger_result(incoming_response)
20:19:02 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 124, in wrapper
20:19:02 | return func(self, *args, **kwargs)
20:19:02 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 303, in _get_swagger_result
20:19:02 | self.request_config.response_callbacks,
20:19:02 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 334, in unmarshal_response
20:19:02 | raise_on_unexpected(incoming_response)
20:19:02 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 408, in raise_on_unexpected
20:19:02 | raise make_http_exception(response=http_response)
20:19:02 | bravado.exception.HTTPInternalServerError: 500 : {"errorType":"INTERNAL_SERVER_ERROR","code":500,"title":"Internal Server Error (d98440aac5e)"}
20:19:02 | During handling of the above exception, another exception occurred:
20:19:02 | Traceback (most recent call last):
20:19:02 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/internal/channels/channels_values_sender.py", line 156, in _send_values
20:19:02 | self._experiment._send_channels_values(channels_with_values)
20:19:02 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/experiments.py", line 1138, in _send_channels_values
20:19:02 | self._backend.send_channels_values(self, channels_with_values)
20:19:02 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/utils.py", line 221, in wrapper
20:19:02 | raise ServerError()
20:19:02 | neptune.api_exceptions.ServerError: Server error. Please try again later.
20:19:06 | E1023 04:09:53.008028 35187409351088 channels_values_sender.py:164] Failed to send channel value.
20:19:06 | Traceback (most recent call last):
20:19:06 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/utils.py", line 210, in wrapper
20:19:06 | return func(*args, **kwargs)
20:19:06 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/internal/backends/hosted_neptune_backend.py", line 556, in send_channels_values
20:19:06 | channelsValues=input_channels_values
20:19:06 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 239, in response
20:19:06 | six.reraise(*sys.exc_info())
20:19:06 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/six.py", line 693, in reraise
20:19:06 | raise value
20:19:06 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 200, in response
20:19:06 | swagger_result = self._get_swagger_result(incoming_response)
20:19:06 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 124, in wrapper
20:19:06 | return func(self, *args, **kwargs)
20:19:06 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 303, in _get_swagger_result
20:19:06 | self.request_config.response_callbacks,
20:19:06 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 334, in unmarshal_response
20:19:06 | raise_on_unexpected(incoming_response)
20:19:06 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/http_future.py", line 408, in raise_on_unexpected
20:19:06 | raise make_http_exception(response=http_response)
20:19:06 | bravado.exception.HTTPInternalServerError: 500 : {"errorType":"INTERNAL_SERVER_ERROR","code":500,"title":"Internal Server Error (b34b09c679a)"}
20:19:06 | During handling of the above exception, another exception occurred:
20:19:06 | Traceback (most recent call last):
20:19:06 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/internal/channels/channels_values_sender.py", line 156, in _send_values
20:19:06 | self._experiment._send_channels_values(channels_with_values)
20:19:06 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/experiments.py", line 1138, in _send_channels_values
20:19:06 | self._backend.send_channels_values(self, channels_with_values)
20:19:06 | File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/utils.py", line 221, in wrapper
20:19:06 | raise ServerError()
20:19:06 | neptune.api_exceptions.ServerError: Server error. Please try again later.
Traceback (most recent call last):
File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/internal/threads/ping_thread.py", line 37, in run
self.__backend.ping_experiment(self.__experiment)
File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/utils.py", line 210, in wrapper
return func(*args, **kwargs)
File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/internal/backends/hosted_neptune_backend.py", line 611, in ping_experiment
self.backend_swagger_client.api.pingExperiment(experimentId=experiment.internal_id).response()
File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/client.py", line 279, in __call__
request_config=request_config,
File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/requests_client.py", line 399, in request
self.authenticated_request(sanitized_params),
File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/requests_client.py", line 440, in authenticated_request
return self.apply_authentication(requests.Request(**request_params))
File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/bravado/requests_client.py", line 445, in apply_authentication
return self.authenticator.apply(request)
File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/oauth.py", line 90, in apply
self.auth.refresh_token_if_needed()
File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/utils.py", line 210, in wrapper
return func(*args, **kwargs)
File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/oauth.py", line 51, in refresh_token_if_needed
self._refresh_token()
File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/neptune/oauth.py", line 54, in _refresh_token
self.session.refresh_token(self.session.auto_refresh_url)
File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/requests_oauthlib/oauth2_session.py", line 446, in refresh_token
self.token = self._client.parse_request_body_response(r.text, scope=self.scope)
File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/oauthlib/oauth2/rfc6749/clients/base.py", line 421, in parse_request_body_response
self.token = parse_token_response(body, scope=scope)
File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/oauthlib/oauth2/rfc6749/parameters.py", line 431, in parse_token_response
validate_token_parameters(params)
File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/oauthlib/oauth2/rfc6749/parameters.py", line 438, in validate_token_parameters
raise_from_error(params.get('error'), params)
File "/gpfs/share/skynet/apps/anaconda3/envs/wmlce_env_1.6.1/lib/python3.6/site-packages/oauthlib/oauth2/rfc6749/errors.py", line 405, in raise_from_error
raise cls(**kwargs)
oauthlib.oauth2.rfc6749.errors.InvalidGrantError: (invalid_grant) Offline user session not found