Mobile-Agent-v2 依赖版本问题
安装完之后运行 run.py 会报错:ModuleNotFoundError: OCRDetectionPipeline: No module named 'tf_keras.legacy_tf_layers'。请问这和 Python 版本有关系吗?或者应该怎么解决?谢谢!
用python3.10试一下,然后重新安装下面的依赖: TensorFlow==2.19.0 keras==3.5.0 tf_slim tf_keras==2.19.0
用python3.10试一下,然后重新安装下面的依赖: TensorFlow==2.19.0 keras==3.5.0 tf_slim tf_keras==2.19.0
我试了一下3.10还是会报 ModuleNotFoundError: No module named 'tf_keras.legacy_tf_layers' 看起来是TensorFlow 2.19.0与OCR检测模型依赖的旧版API不兼容,希望能有解决方式
把原来的 OCR 替换成任意其他 OCR 库即可。我用的是 easyocr(下面的代码是用 Cursor AI 生成的),如果有更好的选择,欢迎交流。
- 新增文件 MobileAgent/text_localization_easyocr.py,内容如下:
import cv2
import numpy as np
from MobileAgent.crop import crop_image
import easyocr
def order_point(coor):
    """Order the four corners of a quadrilateral by angle around its centroid.

    Args:
        coor: Array-like holding eight numbers that reshape to four
            ``(x, y)`` points.

    Returns:
        A ``(4, 2)`` ``float32`` array with the points sorted by ascending
        ``arctan2(dy, dx)`` measured from the centroid. When the first
        sorted point lies to the right of the centroid, the sequence is
        rotated by one position so that the last point comes first.
    """
    pts = np.array(coor).reshape([4, 2])
    center = np.sum(pts, 0) / pts.shape[0]

    # Polar angle of every corner relative to the centroid.
    dx = pts[:, 0] - center[0]
    dy = pts[:, 1] - center[1]
    ordered = pts[np.argsort(np.arctan2(dy, dx))]

    if ordered[0][0] > center[0]:
        # Cyclic shift by one row: the last corner moves to the front.
        ordered = np.roll(ordered, 1, axis=0)

    return ordered.astype('float32')
def longest_common_substring_length(str1, str2):
    """Return the length of the longest common subsequence of two strings.

    NOTE: despite the function name, the matched characters do not have to
    be contiguous. On a mismatch the best length found so far is carried
    forward instead of being reset to zero, which is the subsequence
    recurrence rather than the substring one.

    Args:
        str1: First sequence (indexable, with a length).
        str2: Second sequence (indexable, with a length).

    Returns:
        The number of elements in the longest sequence that appears, in
        order, in both inputs; 0 when either input is empty.
    """
    width = len(str2)
    # Only the previous DP row is needed to fill in the current one.
    prev_row = [0] * (width + 1)
    for row in range(1, len(str1) + 1):
        cur_row = [0] * (width + 1)
        for col in range(1, width + 1):
            if str1[row - 1] == str2[col - 1]:
                cur_row[col] = prev_row[col - 1] + 1
            else:
                cur_row[col] = max(prev_row[col], cur_row[col - 1])
        prev_row = cur_row
    return prev_row[width]
# Lazily created EasyOCR reader shared by every ocr() call in this process.
_EASYOCR_READER = None


def _get_easyocr_reader():
    """Return the shared EasyOCR reader, constructing it on first use."""
    global _EASYOCR_READER
    if _EASYOCR_READER is None:
        # Simplified Chinese + English, CPU only. Constructing a Reader loads
        # the OCR models, so it is done once instead of on every ocr() call.
        _EASYOCR_READER = easyocr.Reader(['ch_sim', 'en'], gpu=False)
    return _EASYOCR_READER


def ocr(image_path, ocr_detection=None, ocr_recognition=None):
    """Detect and recognise text in an image with EasyOCR.

    Keeps the same call signature and return format as the ``ocr`` function
    in the original ``text_localization.py`` so it can be swapped in.

    Args:
        image_path: Path of the image to read.
        ocr_detection: Unused; accepted only for call compatibility.
        ocr_recognition: Unused; accepted only for call compatibility.

    Returns:
        A ``(text_data, coordinate)`` tuple:
            text_data: list of recognised text strings.
            coordinate: list of axis-aligned boxes ``[x1, y1, x2, y2]``
                (integers), one per entry of ``text_data``.
        Both lists are empty when the image cannot be read or nothing is
        detected.
    """
    text_data = []
    coordinate = []

    # cv2.imread returns None for an unreadable path; bail out before the
    # (expensive) reader is ever created.
    image_full = cv2.imread(image_path)
    if image_full is None:
        return text_data, coordinate

    result = _get_easyocr_reader().readtext(image_full)
    if not result:
        return text_data, coordinate

    for detection in result:
        # Each detection is (bbox, text, confidence); bbox holds the four
        # corner points [[x1, y1], [x2, y2], [x3, y3], [x4, y4]].
        bbox = detection[0]
        text = detection[1]
        if not text or not text.strip():
            continue

        # Collapse the quadrilateral to its axis-aligned bounding rectangle.
        x_coords = [point[0] for point in bbox]
        y_coords = [point[1] for point in bbox]
        text_data.append(text)
        coordinate.append([
            int(min(x_coords)),
            int(min(y_coords)),
            int(max(x_coords)),
            int(max(y_coords)),
        ])

    return text_data, coordinate
- 将 run.py 中的 import 替换为:
# from MobileAgent.text_localization import ocr
from MobileAgent.text_localization_easyocr import ocr
- 注释掉 OCR 相关的模型加载代码:
# ocr_recognition = pipeline(Tasks.ocr_recognition, model='damo/cv_convnextTiny_ocr-recognition-document_damo', device="cpu")
# ocr_detection = pipeline(Tasks.ocr_detection, model='damo/cv_resnet18_ocr-detection-line-level_damo', device="cpu")
使用 Python 3.9.19,并安装以下依赖版本是可以正常运行的:TensorFlow==2.9.1 keras==2.9.0 tf_slim tf_keras==2.15.0
mac 的话换成tensorflow-macos==2.9