Tool HTTP request, returned application/octet-stream, file type not recognized
Self Checks
- [x] This is only for bug report, if you would like to ask a question, please head to Discussions.
- [x] I have searched for existing issues, including closed ones.
- [x] I confirm that I am using English to submit this report (我已阅读并同意 Language Policy).
- [x] [FOR CHINESE USERS] 请务必使用英文提交 Issue,否则会被关闭。谢谢!:)
- [x] Please do not modify this template :) and fill in all the required fields.
Dify version
1.0.0
Cloud or Self Hosted
Self Hosted (Docker)
Steps to reproduce
curl --location --request POST 'http://10.27.24.233:8081/api/v1/convert/pdf/word'
--form 'fileInput=@"/Users/soya/Desktop/1219926043.pdf"'
--form 'outputFormat="docx"'
The above is the curl request. When the same request is sent from Postman, the returned response headers are as follows:
Content-Type | application/octet-stream
Content-Disposition | form-data; name="attachment"; filename="1219926043.docx"
Content-Length | 116257
The workflow is as follows:
The debug trace information is as follows: { "status_code": 200, "body": "", "headers": { "date": "Sat, 08 Mar 2025 04:42:33 GMT", "content-type": "application/octet-stream", "content-disposition": "form-data; name="attachment"; filename="1219926043.docx"", "content-length": "116257", "cache-status": "ac5c6a2e8fde;detail=mismatch", "via": "1.1 ac5c6a2e8fde (squid/6.10)", "connection": "keep-alive" }, "files": [ { "dify_model_identity": "dify__file", "id": null, "tenant_id": "5185fd0a-047c-4abf-b27f-f91ed93f6c0f", "type": "custom", "transfer_method": "tool_file", "remote_url": null, "related_id": "71aef328-2b8a-4808-b49d-5dd1079f900f", "filename": "2ff8edd9bb0d4557a2c68d2a02e98271.bin", "extension": ".bin", "mime_type": "application/octet-stream", "size": 116257, "url": "http://10.27.24.230//files/tools/71aef328-2b8a-4808-b49d-5dd1079f900f.bin?timestamp=1741408957&nonce=d0a6a819f20afe7291b99d35a98c1082&sign=T9Qu-AmUwlsvOJmomm6CrrxJaZCmQEkAl1Vc2Fwp6ng=" } ] }
✔️ Expected Behavior
{ "status_code": 200, "body": "", "headers": { "date": "Sat, 08 Mar 2025 04:42:33 GMT", "content-type": "application/octet-stream", "content-disposition": "form-data; name="attachment"; filename="1219926043.docx"", "content-length": "116257", "cache-status": "ac5c6a2e8fde;detail=mismatch", "via": "1.1 ac5c6a2e8fde (squid/6.10)", "connection": "keep-alive" }, "files": [ { "dify_model_identity": "dify__file", "id": null, "tenant_id": "5185fd0a-047c-4abf-b27f-f91ed93f6c0f", "type": "custom", "transfer_method": "tool_file", "remote_url": null, "related_id": "71aef328-2b8a-4808-b49d-5dd1079f900f", "filename": "2ff8edd9bb0d4557a2c68d2a02e98271.docx", "extension": ".docx", "mime_type": "application/octet-stream", "size": 116257, "url": "http://10.27.24.230//files/tools/71aef328-2b8a-4808-b49d-5dd1079f900f.bin?timestamp=1741408957&nonce=d0a6a819f20afe7291b99d35a98c1082&sign=T9Qu-AmUwlsvOJmomm6CrrxJaZCmQEkAl1Vc2Fwp6ng=" } ] }
❌ Actual Behavior
{ "status_code": 200, "body": "", "headers": { "date": "Sat, 08 Mar 2025 04:42:33 GMT", "content-type": "application/octet-stream", "content-disposition": "form-data; name="attachment"; filename="1219926043.docx"", "content-length": "116257", "cache-status": "ac5c6a2e8fde;detail=mismatch", "via": "1.1 ac5c6a2e8fde (squid/6.10)", "connection": "keep-alive" }, "files": [ { "dify_model_identity": "dify__file", "id": null, "tenant_id": "5185fd0a-047c-4abf-b27f-f91ed93f6c0f", "type": "custom", "transfer_method": "tool_file", "remote_url": null, "related_id": "71aef328-2b8a-4808-b49d-5dd1079f900f", "filename": "2ff8edd9bb0d4557a2c68d2a02e98271.bin", "extension": ".bin", "mime_type": "application/octet-stream", "size": 116257, "url": "http://10.27.24.230//files/tools/71aef328-2b8a-4808-b49d-5dd1079f900f.bin?timestamp=1741408957&nonce=d0a6a819f20afe7291b99d35a98c1082&sign=T9Qu-AmUwlsvOJmomm6CrrxJaZCmQEkAl1Vc2Fwp6ng=" } ] }
How do I get the HTTP request tool to recognize the file extension as .docx?
To configure the tool HTTP request to recognize the file extension as .docx instead of .bin when the content type is application/octet-stream, you can implement a detection process that checks the Content-Disposition header for the filename and adjusts the file extension accordingly. Here's a Python snippet that demonstrates this approach:
import os
import re
import mimetypes
from uuid import uuid4
import magic
def guess_file_info_from_response(response):
    """Work out filename, extension, MIME type and size for an HTTP response.

    The filename is taken, in order of preference, from:

    1. the ``Content-Disposition`` header (the server's explicit suggestion),
    2. the last path segment of the request URL,
    3. a freshly generated UUID.

    The header is consulted first because API-style URLs such as
    ``/api/v1/convert/pdf/word`` carry no meaningful file name, while the
    header does (``filename="1219926043.docx"``).

    Args:
        response: An HTTP response object exposing ``url``, ``headers``
            (a mapping with ``get``) and ``content`` (bytes).

    Returns:
        A dict with the keys ``filename``, ``extension``, ``mimetype`` and
        ``size``. ``size`` is ``-1`` when ``Content-Length`` is missing or
        not an integer.
    """
    # Imported locally so the function does not depend on file-level imports
    # that the surrounding snippet never declared.
    import urllib.parse
    from email.message import Message

    # python-magic is optional: without it (or without libmagic) we simply
    # skip content sniffing instead of failing.
    try:
        import magic
    except ImportError:
        magic = None

    generic_mimetype = "application/octet-stream"
    # Office Open XML types that mimetypes.guess_extension() may not know.
    ooxml_extensions = {
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
    }

    url = str(response.url)
    headers = response.headers

    # 1. Prefer the filename suggested by the server. The email parser copes
    #    with quoting and RFC 2231 ``filename*=`` encoding, which a greedy
    #    regex does not (it would keep the trailing quote).
    filename = ""
    content_disposition = headers.get("Content-Disposition")
    if content_disposition:
        parsed_header = Message()
        parsed_header["Content-Disposition"] = content_disposition
        suggested_name = parsed_header.get_filename() or ""
        # The header is untrusted input: drop any directory components.
        filename = os.path.basename(suggested_name.replace("\\", "/")).strip()

    # 2. Fall back to the last segment of the URL path.
    if not filename:
        filename = os.path.basename(urllib.parse.urlparse(url).path)

    # 3. Last resort: a unique generated name.
    if not filename:
        filename = str(uuid4())

    # Guess the MIME type from the filename first, then from the URL.
    mimetype, _ = mimetypes.guess_type(filename)
    if mimetype is None:
        mimetype, _ = mimetypes.guess_type(url)
    if mimetype is None:
        # Strip parameters such as "; charset=binary" so comparisons work.
        content_type = headers.get("Content-Type") or generic_mimetype
        mimetype = content_type.split(";", 1)[0].strip().lower() or generic_mimetype

    # Sniff the content when all we have is the generic type.
    if mimetype == generic_mimetype and magic is not None:
        try:
            mimetype = magic.from_buffer(response.content[:1024], mime=True)
        except magic.MagicException:
            # Sniffing is best effort; keep the generic type on failure.
            pass

    # Make sure the filename carries an extension.
    extension = os.path.splitext(filename)[1]
    if not extension:
        extension = mimetypes.guess_extension(mimetype) or ".bin"
        filename = f"{filename}{extension}"

    # A ".bin" extension is replaced when the MIME type identifies an
    # Office Open XML document.
    if extension == ".bin" and mimetype in ooxml_extensions:
        extension = ooxml_extensions[mimetype]
        filename = f"{os.path.splitext(filename)[0]}{extension}"

    # A malformed Content-Length must not abort the whole detection.
    try:
        size = int(headers.get("Content-Length", -1))
    except (TypeError, ValueError):
        size = -1

    return {
        "filename": filename,
        "extension": extension,
        "mimetype": mimetype,
        "size": size,
    }
This code uses the python-magic library to perform content-based MIME type detection and adjusts the file extension to .docx if necessary [1].
To continue talking to Dosu, mention @dosu.
Help Dosu learn! Give it feedback: Great Response | Irrelevant Answer | Incorrect Sources | Too Verbose | Hallucination | Bug Report | Other
To configure the tool HTTP request to recognize the file extension as
.docxinstead of.binwhen the content type isapplication/octet-stream, you can implement a detection process that checks theContent-Dispositionheader for the filename and adjusts the file extension accordingly. Here's a Python snippet that demonstrates this approach:要配置工具 HTTP 请求,使其在内容类型为application/octet-stream时将文件扩展名识别为.docx而不是.bin,您可以实施一个检测过程,检查文件名的Content-Disposition标头并相应地调整文件扩展名。以下是演示此方法的 Python 代码片段:import os import re import mimetypes from uuid import uuid4 import magic
def guess_file_info_from_response(response): url = str(response.url) # Try to extract filename from URL parsed_url = urllib.parse.urlparse(url) url_path = parsed_url.path filename = os.path.basename(url_path)
# If filename couldn't be extracted, use Content-Disposition header if not filename: content_disposition = response.headers.get("Content-Disposition") if content_disposition: filename_match = re.search(r'filename="?(.+)"?', content_disposition) if filename_match: filename = filename_match.group(1) # If still no filename, generate a unique one if not filename: unique_name = str(uuid4()) filename = f"{unique_name}" # Guess MIME type from filename first, then URL mimetype, _ = mimetypes.guess_type(filename) if mimetype is None: mimetype, _ = mimetypes.guess_type(url) if mimetype is None: # If guessing fails, use Content-Type from response headers mimetype = response.headers.get("Content-Type", "application/octet-stream") # Use python-magic to guess MIME type if still unknown or generic if mimetype == "application/octet-stream" and magic is not None: try: mimetype = magic.from_buffer(response.content[:1024], mime=True) except magic.MagicException: pass extension = os.path.splitext(filename)[1] # Ensure filename has an extension if not extension: extension = mimetypes.guess_extension(mimetype) or ".bin" filename = f"{filename}{extension}" # Handle special cases for extensions if extension == ".bin" and mimetype == "application/vnd.openxmlformats-officedocument.wordprocessingml.document": extension = ".docx" filename = f"{os.path.splitext(filename)[0]}.docx" return { "filename": filename, "extension": extension, "mimetype": mimetype, "size": int(response.headers.get("Content-Length", -1)), }This code uses the
python-magic library to perform content-based MIME type detection and adjusts the file extension to .docx if necessary [1]. 此代码使用 python-magic 库执行基于内容的 MIME 类型检测,并在必要时将文件扩展名调整为 .docx [1]。 To continue talking to Dosu, mention @dosu. 要继续与 Dosu 交谈,请提及 @dosu。
Help Dosu learn! Give it feedback: Great Response | Irrelevant Answer | Incorrect Sources | Too Verbose | Hallucination | Bug Report | Other 帮助 Dosu 学习!提供反馈:很好的回应 | 不相关的答案 | 不正确的来源 | 太冗长 | 幻觉 | 错误报告 | 其他
How to configure the detection process?
To configure the detection process to ensure the tool HTTP request recognizes the file extension as .docx instead of .bin when the content type is application/octet-stream, you can follow these steps:
- Extract the filename: first, try to extract the filename from the URL or the `Content-Disposition` header.
- Determine the MIME type: use `mimetypes.guess_type()` to determine the MIME type based on the filename and URL.
- Content-based detection: if the MIME type is still unknown or generic (`application/octet-stream`), use the `python-magic` library to perform content-based MIME type detection.
- Adjust the file extension: ensure the filename has the correct extension. Specifically, if the MIME type indicates a `.docx` file, adjust the extension accordingly.
Here's a Python snippet demonstrating this approach:
import os
import re
import mimetypes
from uuid import uuid4
import magic
def guess_file_info_from_response(response):
    """Work out filename, extension, MIME type and size for an HTTP response.

    The filename is taken, in order of preference, from:

    1. the ``Content-Disposition`` header (the server's explicit suggestion),
    2. the last path segment of the request URL,
    3. a freshly generated UUID.

    The header is consulted first because API-style URLs such as
    ``/api/v1/convert/pdf/word`` carry no meaningful file name, while the
    header does (``filename="1219926043.docx"``).

    Args:
        response: An HTTP response object exposing ``url``, ``headers``
            (a mapping with ``get``) and ``content`` (bytes).

    Returns:
        A dict with the keys ``filename``, ``extension``, ``mimetype`` and
        ``size``. ``size`` is ``-1`` when ``Content-Length`` is missing or
        not an integer.
    """
    # Imported locally so the function does not depend on file-level imports
    # that the surrounding snippet never declared.
    import urllib.parse
    from email.message import Message

    # python-magic is optional: without it (or without libmagic) we simply
    # skip content sniffing instead of failing.
    try:
        import magic
    except ImportError:
        magic = None

    generic_mimetype = "application/octet-stream"
    # Office Open XML types that mimetypes.guess_extension() may not know.
    ooxml_extensions = {
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
    }

    url = str(response.url)
    headers = response.headers

    # 1. Prefer the filename suggested by the server. The email parser copes
    #    with quoting and RFC 2231 ``filename*=`` encoding, which a greedy
    #    regex does not (it would keep the trailing quote).
    filename = ""
    content_disposition = headers.get("Content-Disposition")
    if content_disposition:
        parsed_header = Message()
        parsed_header["Content-Disposition"] = content_disposition
        suggested_name = parsed_header.get_filename() or ""
        # The header is untrusted input: drop any directory components.
        filename = os.path.basename(suggested_name.replace("\\", "/")).strip()

    # 2. Fall back to the last segment of the URL path.
    if not filename:
        filename = os.path.basename(urllib.parse.urlparse(url).path)

    # 3. Last resort: a unique generated name.
    if not filename:
        filename = str(uuid4())

    # Guess the MIME type from the filename first, then from the URL.
    mimetype, _ = mimetypes.guess_type(filename)
    if mimetype is None:
        mimetype, _ = mimetypes.guess_type(url)
    if mimetype is None:
        # Strip parameters such as "; charset=binary" so comparisons work.
        content_type = headers.get("Content-Type") or generic_mimetype
        mimetype = content_type.split(";", 1)[0].strip().lower() or generic_mimetype

    # Sniff the content when all we have is the generic type.
    if mimetype == generic_mimetype and magic is not None:
        try:
            mimetype = magic.from_buffer(response.content[:1024], mime=True)
        except magic.MagicException:
            # Sniffing is best effort; keep the generic type on failure.
            pass

    # Make sure the filename carries an extension.
    extension = os.path.splitext(filename)[1]
    if not extension:
        extension = mimetypes.guess_extension(mimetype) or ".bin"
        filename = f"{filename}{extension}"

    # A ".bin" extension is replaced when the MIME type identifies an
    # Office Open XML document.
    if extension == ".bin" and mimetype in ooxml_extensions:
        extension = ooxml_extensions[mimetype]
        filename = f"{os.path.splitext(filename)[0]}{extension}"

    # A malformed Content-Length must not abort the whole detection.
    try:
        size = int(headers.get("Content-Length", -1))
    except (TypeError, ValueError):
        size = -1

    return {
        "filename": filename,
        "extension": extension,
        "mimetype": mimetype,
        "size": size,
    }
This code ensures that if the MIME type is detected as application/vnd.openxmlformats-officedocument.wordprocessingml.document, the file extension is set to .docx [1].
To continue talking to Dosu, mention @dosu.
Help Dosu learn! Give it feedback: Great Response | Irrelevant Answer | Incorrect Sources | Too Verbose | Hallucination | Bug Report | Other