tika-python
tika-python copied to clipboard
How to fix ReadTimeout: HTTPConnectionPool(host='localhost', port=9998): Read timed out. (read timeout=60)
After installing tika-python with:
pip install tika
When attempting:
In [21]: import tika
...: tika.initVM()
...: from tika import parser
In [22]: parsed = parser.from_file(file_path)
I get
---------------------------------------------------------------------------
timeout Traceback (most recent call last)
File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:466, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
462 except BaseException as e:
463 # Remove the TypeError from the exception chain in
464 # Python 3 (including for exceptions like SystemExit).
465 # Otherwise it looks like a bug in the code.
--> 466 six.raise_from(e, None)
467 except (SocketTimeout, BaseSSLError, SocketError) as e:
File <string>:3, in raise_from(value, from_value)
File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:461, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
460 try:
--> 461 httplib_response = conn.getresponse()
462 except BaseException as e:
463 # Remove the TypeError from the exception chain in
464 # Python 3 (including for exceptions like SystemExit).
465 # Otherwise it looks like a bug in the code.
File ~/anaconda3/envs/master/lib/python3.8/http/client.py:1348, in HTTPConnection.getresponse(self)
1347 try:
-> 1348 response.begin()
1349 except ConnectionError:
File ~/anaconda3/envs/master/lib/python3.8/http/client.py:316, in HTTPResponse.begin(self)
315 while True:
--> 316 version, status, reason = self._read_status()
317 if status != CONTINUE:
File ~/anaconda3/envs/master/lib/python3.8/http/client.py:277, in HTTPResponse._read_status(self)
276 def _read_status(self):
--> 277 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
278 if len(line) > _MAXLINE:
File ~/anaconda3/envs/master/lib/python3.8/socket.py:669, in SocketIO.readinto(self, b)
668 try:
--> 669 return self._sock.recv_into(b)
670 except timeout:
timeout: timed out
During handling of the above exception, another exception occurred:
ReadTimeoutError Traceback (most recent call last)
File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/adapters.py:486, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
485 try:
--> 486 resp = conn.urlopen(
487 method=request.method,
488 url=url,
489 body=request.body,
490 headers=request.headers,
491 redirect=False,
492 assert_same_host=False,
493 preload_content=False,
494 decode_content=False,
495 retries=self.max_retries,
496 timeout=timeout,
497 chunked=chunked,
498 )
500 except (ProtocolError, OSError) as err:
File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:798, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
796 e = ProtocolError("Connection aborted.", e)
--> 798 retries = retries.increment(
799 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
800 )
801 retries.sleep()
File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/util/retry.py:550, in Retry.increment(self, method, url, response, error, _pool, _stacktrace)
549 if read is False or not self._is_method_retryable(method):
--> 550 raise six.reraise(type(error), error, _stacktrace)
551 elif read is not None:
File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/packages/six.py:770, in reraise(tp, value, tb)
769 raise value.with_traceback(tb)
--> 770 raise value
771 finally:
File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:714, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
713 # Make the request on the httplib connection object.
--> 714 httplib_response = self._make_request(
715 conn,
716 method,
717 url,
718 timeout=timeout_obj,
719 body=body,
720 headers=headers,
721 chunked=chunked,
722 )
724 # If we're going to release the connection in ``finally:``, then
725 # the response doesn't need to know about the connection. Otherwise
726 # it will also try to release it and we'll have a double-release
727 # mess.
File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:468, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
467 except (SocketTimeout, BaseSSLError, SocketError) as e:
--> 468 self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
469 raise
File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:357, in HTTPConnectionPool._raise_timeout(self, err, url, timeout_value)
356 if isinstance(err, SocketTimeout):
--> 357 raise ReadTimeoutError(
358 self, url, "Read timed out. (read timeout=%s)" % timeout_value
359 )
361 # See the above comment about EAGAIN in Python 3. In Python 2 we have
362 # to specifically catch it and throw the timeout error
ReadTimeoutError: HTTPConnectionPool(host='localhost', port=9998): Read timed out. (read timeout=60)
During handling of the above exception, another exception occurred:
ReadTimeout Traceback (most recent call last)
Cell In[22], line 1
----> 1 parsed = parser.from_file(file_path)
File ~/anaconda3/envs/master/lib/python3.8/site-packages/tika/parser.py:40, in from_file(filename, serverEndpoint, service, xmlContent, headers, config_path, requestOptions, raw_response)
24 '''
25 Parses a file for metadata and content
26 :param filename: path to file which needs to be parsed or binary file using open(path,'rb')
(...)
37 'content' has a str value and metadata has a dict type value.
38 '''
39 if not xmlContent:
---> 40 output = parse1(service, filename, serverEndpoint, headers=headers, config_path=config_path, requestOptions=requestOptions)
41 else:
42 output = parse1(service, filename, serverEndpoint, services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'},
43 headers=headers, config_path=config_path, requestOptions=requestOptions)
File ~/anaconda3/envs/master/lib/python3.8/site-packages/tika/tika.py:337, in parse1(option, urlOrPath, serverEndpoint, verbose, tikaServerJar, responseMimeType, services, rawResponse, headers, config_path, requestOptions)
335 headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)})
336 with urlOrPath if _is_file_object(urlOrPath) else open(path, 'rb') as f:
--> 337 status, response = callServer('put', serverEndpoint, service, f,
338 headers, verbose, tikaServerJar, config_path=config_path,
339 rawResponse=rawResponse, requestOptions=requestOptions)
341 if file_type == 'remote': os.unlink(path)
342 return (status, response)
File ~/anaconda3/envs/master/lib/python3.8/site-packages/tika/tika.py:555, in callServer(verb, serverEndpoint, service, data, headers, verbose, tikaServerJar, httpVerbs, classpath, rawResponse, config_path, requestOptions)
552 effectiveRequestOptions = requestOptionsDefault.copy()
553 effectiveRequestOptions.update(requestOptions)
--> 555 resp = verbFn(serviceUrl, encodedData, **effectiveRequestOptions)
557 if verbose:
558 print(sys.stderr, "Request headers: ", headers)
File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/api.py:130, in put(url, data, **kwargs)
118 def put(url, data=None, **kwargs):
119 r"""Sends a PUT request.
120
121 :param url: URL for the new :class:`Request` object.
(...)
127 :rtype: requests.Response
128 """
--> 130 return request("put", url, data=data, **kwargs)
File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/api.py:59, in request(method, url, **kwargs)
55 # By using the 'with' statement we are sure the session is closed, thus we
56 # avoid leaving sockets open which can trigger a ResourceWarning in some
57 # cases, and look like a memory leak in others.
58 with sessions.Session() as session:
---> 59 return session.request(method=method, url=url, **kwargs)
File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/sessions.py:589, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
584 send_kwargs = {
585 "timeout": timeout,
586 "allow_redirects": allow_redirects,
587 }
588 send_kwargs.update(settings)
--> 589 resp = self.send(prep, **send_kwargs)
591 return resp
File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/sessions.py:703, in Session.send(self, request, **kwargs)
700 start = preferred_clock()
702 # Send the request
--> 703 r = adapter.send(request, **kwargs)
705 # Total elapsed time of the request (approximately)
706 elapsed = preferred_clock() - start
File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/adapters.py:532, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
530 raise SSLError(e, request=request)
531 elif isinstance(e, ReadTimeoutError):
--> 532 raise ReadTimeout(e, request=request)
533 elif isinstance(e, _InvalidHeader):
534 raise InvalidHeader(e, request=request)
ReadTimeout: HTTPConnectionPool(host='localhost', port=9998): Read timed out. (read timeout=60)
In [23]:
How can I overcome it?
Never mind, I had missed setting the following environment variable:
TIKA_SERVER_JAR="file:////tika-server-standard.jar"
After setting this environment variable, it worked.