Bug in Gerapy Auto Extractor: similarity2 uses the intersection where the union is needed
def similarity2(s1, s2):
    """
    Get the similarity of two strings as the Jaccard index of their character sets.

    :param s1: first string
    :param s2: second string
    :return: len(intersection) / len(union) of the two character sets, a value
        in [0, 1]; 0 when either argument is empty or None
    """
    if not s1 or not s2:
        return 0
    s1_set = set(s1)
    s2_set = set(s2)
    intersection = s1_set & s2_set
    # The denominator must be the union. Using the intersection here made the
    # result 1 for any overlap and raised ZeroDivisionError for disjoint strings.
    union = s1_set | s2_set
    return len(intersection) / len(union)
union = s1_set.intersection(s2_set) # 这个应该是并集才对吧,源码里边应该是取错了
我也发现这个问题了
from gerapy_auto_extractor.extractors.base import BaseExtractor from lxml.html import HtmlElement, fromstring from gerapy_auto_extractor.patterns.title import METAS from gerapy_auto_extractor.utils.lcs import lcs_of_2 from gerapy_auto_extractor.utils.similarity import similarity2
class TitleExtractor(BaseExtractor):
    """
    Title Extractor which extracts the title of a page.

    Candidates come from meta information, the <title> tag and the h1-h3
    headings; process() decides which candidate is returned.
    """

    def extract_by_meta(self, element: HtmlElement) -> str:
        """
        Extract the title according to meta information.

        Each entry of METAS is evaluated as an XPath expression, in order, and
        the first one producing a non-empty result wins.

        :param element: element the XPath expressions are evaluated against
        :return: the joined result of the first matching expression, or None
            when no expression matches
        """
        for xpath in METAS:
            title = element.xpath(xpath)
            if title:
                return ''.join(title)
        return None

    def extract_by_title(self, element: HtmlElement):
        """
        Get the title from the <title> tag.

        :param element: element the XPath expression is evaluated against
        :return: all text found under <title>, joined and stripped; '' when
            there is none
        """
        return ''.join(element.xpath('//title//text()')).strip()

    def extract_by_hs(self, element: HtmlElement):
        """
        Get every text node found under any h1, h2 or h3 tag.

        :param element: element the XPath expression is evaluated against
        :return: list of text nodes, empty when there are none
        """
        hs = element.xpath('//h1//text()|//h2//text()|//h3//text()')
        return hs or []

    def extract_by_h(self, element: HtmlElement):
        """
        Extract by h tag, with priority h1, then h2, then h3.

        Only the first element of each level is inspected, and only its direct
        text nodes (not text nested in child tags).

        :param element: element the XPath expressions are evaluated against
        :return: stripped first direct text node of the first level that has
            one, or None when no level yields text
        """
        for xpath in ['//h1', '//h2', '//h3']:
            children = element.xpath(xpath)
            if not children:
                continue
            texts = children[0].xpath('./text()')
            if texts:
                return texts[0].strip()
        return None

    def process(self, element: HtmlElement):
        """
        Extract the title from element.

        Order of preference:
        1. the meta title, when present;
        2. the h1-h3 text most similar to the <title> text (similarity2 > 0);
        3. the <title> text itself.

        :param element: element to extract the title from
        :return: the chosen title string ('' when nothing was found)
        """
        title_extracted_by_meta = self.extract_by_meta(element)
        if title_extracted_by_meta:
            return title_extracted_by_meta

        title_extracted_by_title = self.extract_by_title(element)
        # Longest text first, so that equally similar headings resolve to the
        # longest one (the strict '>' below keeps the earliest best candidate).
        title_extracted_by_hs = sorted(self.extract_by_hs(element), key=len, reverse=True)

        best_heading = None
        best_score = 0
        for heading in title_extracted_by_hs:
            # Compare the raw score: truncating it with int() would turn almost
            # every similarity into 0 and make the ranking meaningless.
            score = similarity2(heading, title_extracted_by_title)
            if score > best_score:
                best_heading, best_score = heading, score

        if best_heading is not None:
            return best_heading
        # No heading overlaps with <title> (or there are no headings at all):
        # fall back to the <title> text instead of indexing a placeholder slot.
        return title_extracted_by_title
# Module-level extractor instance reused by every extract_title() call.
title_extractor = TitleExtractor()


def extract_title(html):
    """
    Extract the title from html.

    Hands html to the ``extract`` method of the shared ``title_extractor``.
    ``extract`` is not defined on TitleExtractor in this file, so it is
    presumably inherited from BaseExtractor.

    :param html: value passed unchanged to ``title_extractor.extract``
    :return: whatever ``title_extractor.extract(html)`` returns
    """
    return title_extractor.extract(html)
#把title 替换成我这个就解决了