amazon-textract-response-parser
amazon-textract-response-parser copied to clipboard
Intermittent error "Message: 'PAGE - <UUID> does not have ids with CHILD relationship.'"
While parsing documents with AWS Textract using version 1.0.2 of the parser, we intermittently observe the error below. It does not occur on every run and appears to be random. What could be causing it?
--- Logging error ---
Traceback (most recent call last):
File "/usr/local/lib/python3.11/dist-packages/textractor/parsers/response_parser.py", line 146, in _get_relationship_ids
for rel in block_json["Relationships"]
~~~~~~~~~~^^^^^^^^^^^^^^^^^
KeyError: 'Relationships'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.11/logging/__init__.py", line 449, in format
return self._format(record)
^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/logging/__init__.py", line 445, in _format
return self._fmt % values
~~~~~~~~~~^~~~~~~~
KeyError: 'request_id'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.11/logging/__init__.py", line 1110, in emit
msg = self.format(record)
^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/logging/__init__.py", line 953, in format
return fmt.format(record)
^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/logging/__init__.py", line 690, in format
s = self.formatMessage(record)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/logging/__init__.py", line 659, in formatMessage
return self._style.format(record)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/logging/__init__.py", line 451, in format
raise ValueError('Formatting field not found in record: %s' % e)
ValueError: Formatting field not found in record: 'request_id'
Call stack:
File "/usr/lib/python3.11/threading.py", line 1002, in _bootstrap
self._bootstrap_inner()
File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
self.run()
File "/usr/lib/python3.11/threading.py", line 982, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.11/concurrent/futures/thread.py", line 83, in _worker
work_item.run()
File "/usr/lib/python3.11/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/usr/src/genai/common/utils/context_utils.py", line 15, in wrapper
return context.run(func, *args, **kwargs)
File "/usr/src/genai/rag/orchestrator/rag_orchestrator.py", line 188, in __add_document_to_vector_store
document_pages = self.__document_parser.parse_documents_for(file_path=document_file_path,
File "/usr/src/genai/common/document/parser.py", line 66, in parse_documents_for
return self.__get_pdf_parser(file_path=file_path, parser_config=parser_config).load()
File "/usr/src/genai/common/document/pdf_loader_wrapper.py", line 29, in load
file_path=self.__file_path).load()
File "/usr/src/genai/common/document/aws_textract_document_loader_wrapper.py", line 31, in load
).load()
File "/usr/local/lib/python3.11/dist-packages/langchain_community/document_loaders/pdf.py", line 657, in load
return list(self.lazy_load())
File "/usr/local/lib/python3.11/dist-packages/langchain_community/document_loaders/pdf.py", line 678, in lazy_load
yield from self.parser.parse(blob)
File "/usr/local/lib/python3.11/dist-packages/langchain_community/document_loaders/base.py", line 111, in parse
return list(self.lazy_parse(blob))
File "/usr/local/lib/python3.11/dist-packages/langchain_community/document_loaders/parsers/pdf.py", line 528, in lazy_parse
document = self.textractor.Document.open(textract_response_json)
File "/usr/local/lib/python3.11/dist-packages/textractor/entities/document.py", line 64, in open
return response_parser.parse(fp)
File "/usr/local/lib/python3.11/dist-packages/textractor/parsers/response_parser.py", line 1437, in parse
return parse_document_api_response(response)
File "/usr/local/lib/python3.11/dist-packages/textractor/parsers/response_parser.py", line 1159, in parse_document_api_response
pages, page_elements = _create_page_objects(response)
File "/usr/local/lib/python3.11/dist-packages/textractor/parsers/response_parser.py", line 176, in _create_page_objects
page_children = _get_relationship_ids(page_json, relationship="CHILD")
File "/usr/local/lib/python3.11/dist-packages/textractor/parsers/response_parser.py", line 150, in _get_relationship_ids
logging.info(
Message: 'PAGE - b30acdb8-5ce8-4574-b0af-3e8e5dfc7efe does not have ids with CHILD relationship.'
Arguments: ()
--- Logging error ---
Traceback (most recent call last):
File "/usr/local/lib/python3.11/dist-packages/textractor/parsers/response_parser.py", line 146, in _get_relationship_ids
for rel in block_json["Relationships"]
~~~~~~~~~~^^^^^^^^^^^^^^^^^
KeyError: 'Relationships'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.11/logging/__init__.py", line 449, in format
return self._format(record)
^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/logging/__init__.py", line 445, in _format
return self._fmt % values
~~~~~~~~~~^~~~~~~~
KeyError: 'request_id'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.11/logging/__init__.py", line 1110, in emit
msg = self.format(record)
^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/logging/__init__.py", line 953, in format
return fmt.format(record)
^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/logging/__init__.py", line 690, in format
s = self.formatMessage(record)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/logging/__init__.py", line 659, in formatMessage
return self._style.format(record)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/logging/__init__.py", line 451, in format
raise ValueError('Formatting field not found in record: %s' % e)
ValueError: Formatting field not found in record: 'request_id'
Call stack:
File "/usr/lib/python3.11/threading.py", line 1002, in _bootstrap
self._bootstrap_inner()
File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
self.run()
File "/usr/lib/python3.11/threading.py", line 982, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.11/concurrent/futures/thread.py", line 83, in _worker
work_item.run()
File "/usr/lib/python3.11/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/usr/src/genai/common/utils/context_utils.py", line 15, in wrapper
return context.run(func, *args, **kwargs)
File "/usr/src/genai/rag/orchestrator/rag_orchestrator.py", line 188, in __add_document_to_vector_store
document_pages = self.__document_parser.parse_documents_for(file_path=document_file_path,
File "/usr/src/genai/common/document/parser.py", line 66, in parse_documents_for
return self.__get_pdf_parser(file_path=file_path, parser_config=parser_config).load()
File "/usr/src/genai/common/document/pdf_loader_wrapper.py", line 29, in load
file_path=self.__file_path).load()
File "/usr/src/genai/common/document/aws_textract_document_loader_wrapper.py", line 31, in load
).load()
File "/usr/local/lib/python3.11/dist-packages/langchain_community/document_loaders/pdf.py", line 657, in load
return list(self.lazy_load())
File "/usr/local/lib/python3.11/dist-packages/langchain_community/document_loaders/pdf.py", line 678, in lazy_load
yield from self.parser.parse(blob)
File "/usr/local/lib/python3.11/dist-packages/langchain_community/document_loaders/base.py", line 111, in parse
return list(self.lazy_parse(blob))
File "/usr/local/lib/python3.11/dist-packages/langchain_community/document_loaders/parsers/pdf.py", line 528, in lazy_parse
document = self.textractor.Document.open(textract_response_json)
File "/usr/local/lib/python3.11/dist-packages/textractor/entities/document.py", line 64, in open
return response_parser.parse(fp)
File "/usr/local/lib/python3.11/dist-packages/textractor/parsers/response_parser.py", line 1437, in parse
return parse_document_api_response(response)
File "/usr/local/lib/python3.11/dist-packages/textractor/parsers/response_parser.py", line 1159, in parse_document_api_response
pages, page_elements = _create_page_objects(response)
File "/usr/local/lib/python3.11/dist-packages/textractor/parsers/response_parser.py", line 176, in _create_page_objects
page_children = _get_relationship_ids(page_json, relationship="CHILD")
File "/usr/local/lib/python3.11/dist-packages/textractor/parsers/response_parser.py", line 150, in _get_relationship_ids
logging.info(
Message: 'PAGE - f2068f0c-94f5-46a0-af4d-144c32528c35 does not have ids with CHILD relationship.'
Arguments: ()
--- Logging error ---
Traceback (most recent call last):
File "/usr/local/lib/python3.11/dist-packages/textractor/parsers/response_parser.py", line 146, in _get_relationship_ids
for rel in block_json["Relationships"]
~~~~~~~~~~^^^^^^^^^^^^^^^^^
KeyError: 'Relationships'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.11/logging/__init__.py", line 449, in format
return self._format(record)
^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/logging/__init__.py", line 445, in _format
return self._fmt % values
~~~~~~~~~~^~~~~~~~
KeyError: 'request_id'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.11/logging/__init__.py", line 1110, in emit
msg = self.format(record)
^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/logging/__init__.py", line 953, in format
return fmt.format(record)
^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/logging/__init__.py", line 690, in format
s = self.formatMessage(record)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/logging/__init__.py", line 659, in formatMessage
return self._style.format(record)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.11/logging/__init__.py", line 451, in format
raise ValueError('Formatting field not found in record: %s' % e)
ValueError: Formatting field not found in record: 'request_id'
Call stack:
File "/usr/lib/python3.11/threading.py", line 1002, in _bootstrap
self._bootstrap_inner()
File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
self.run()
File "/usr/lib/python3.11/threading.py", line 982, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.11/concurrent/futures/thread.py", line 83, in _worker
work_item.run()
File "/usr/lib/python3.11/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/usr/src/genai/common/utils/context_utils.py", line 15, in wrapper
return context.run(func, *args, **kwargs)
File "/usr/src/genai/rag/orchestrator/rag_orchestrator.py", line 188, in __add_document_to_vector_store
document_pages = self.__document_parser.parse_documents_for(file_path=document_file_path,
File "/usr/src/genai/common/document/parser.py", line 66, in parse_documents_for
return self.__get_pdf_parser(file_path=file_path, parser_config=parser_config).load()
File "/usr/src/genai/common/document/pdf_loader_wrapper.py", line 29, in load
file_path=self.__file_path).load()
File "/usr/src/genai/common/document/aws_textract_document_loader_wrapper.py", line 31, in load
).load()
File "/usr/local/lib/python3.11/dist-packages/langchain_community/document_loaders/pdf.py", line 657, in load
return list(self.lazy_load())
File "/usr/local/lib/python3.11/dist-packages/langchain_community/document_loaders/pdf.py", line 678, in lazy_load
yield from self.parser.parse(blob)
File "/usr/local/lib/python3.11/dist-packages/langchain_community/document_loaders/base.py", line 111, in parse
return list(self.lazy_parse(blob))
File "/usr/local/lib/python3.11/dist-packages/langchain_community/document_loaders/parsers/pdf.py", line 528, in lazy_parse
document = self.textractor.Document.open(textract_response_json)
File "/usr/local/lib/python3.11/dist-packages/textractor/entities/document.py", line 64, in open
return response_parser.parse(fp)
File "/usr/local/lib/python3.11/dist-packages/textractor/parsers/response_parser.py", line 1437, in parse
return parse_document_api_response(response)
File "/usr/local/lib/python3.11/dist-packages/textractor/parsers/response_parser.py", line 1159, in parse_document_api_response
pages, page_elements = _create_page_objects(response)
File "/usr/local/lib/python3.11/dist-packages/textractor/parsers/response_parser.py", line 176, in _create_page_objects
page_children = _get_relationship_ids(page_json, relationship="CHILD")
File "/usr/local/lib/python3.11/dist-packages/textractor/parsers/response_parser.py", line 150, in _get_relationship_ids
logging.info(
Message: 'PAGE - 780480c8-9f5a-4aa4-bbd2-5f509c1fe83c does not have ids with CHILD relationship.'
Arguments: ()