GerapyAutoExtractor icon indicating copy to clipboard operation
GerapyAutoExtractor copied to clipboard

Bug in Gerapy Auto Extractor's similarity2 function

Open wf4867612 opened this issue 3 years ago • 2 comments

def similarity2(s1, s2):
    """
    get similarity of two strings, measured on their sets of characters
    as len(intersection) / len(union) (Jaccard index)
    :param s1: first string
    :param s2: second string
    :return: ratio between 0 and 1; 0 if either string is empty
    """
    if not s1 or not s2:
        return 0
    s1_set = set(s1)
    s2_set = set(s2)
    intersection = s1_set & s2_set
    # The denominator has to be the union of both character sets. Taking the
    # intersection here as well makes every overlapping pair score 1.0 and
    # raises ZeroDivisionError for strings with no character in common.
    union = s1_set | s2_set
    return len(intersection) / len(union)

union = s1_set.intersection(s2_set)    # 这个应该是并集才对吧,源码里边应该是取错了

wf4867612 avatar Aug 25 '22 03:08 wf4867612

我也发现这个问题了

cbc123 avatar Nov 13 '22 14:11 cbc123

from gerapy_auto_extractor.extractors.base import BaseExtractor from lxml.html import HtmlElement, fromstring from gerapy_auto_extractor.patterns.title import METAS from gerapy_auto_extractor.utils.lcs import lcs_of_2 from gerapy_auto_extractor.utils.similarity import similarity2

class TitleExtractor(BaseExtractor):
    """
    Title Extractor which extracts the title of a page
    """

    def extract_by_meta(self, element: HtmlElement) -> str:
        """
        extract title according to meta
        :param element: element to evaluate the METAS xpath expressions on
        :return: joined result of the first METAS xpath that matches,
                 None if none of them matches
        """
        for xpath in METAS:
            title = element.xpath(xpath)
            if title:
                return ''.join(title)
        return None

    def extract_by_title(self, element: HtmlElement):
        """
        get title from <title> tag
        :param element: element to search
        :return: stripped concatenation of all text found under <title>
        """
        return ''.join(element.xpath('//title//text()')).strip()

    def extract_by_hs(self, element: HtmlElement):
        """
        get title candidates from all h1-h3 tags
        :param element: element to search
        :return: list of text nodes found under any h1, h2 or h3 tag,
                 empty list if there are none
        """
        hs = element.xpath('//h1//text()|//h2//text()|//h3//text()')
        return hs or []

    def extract_by_h(self, element: HtmlElement):
        """
        extract by h tag, priority h1, h2, h3
        :param element: element to search
        :return: stripped first direct text node of the first matching tag,
                 None if no such text exists
        """
        for xpath in ['//h1', '//h2', '//h3']:
            children = element.xpath(xpath)
            if not children:
                continue
            child = children[0]
            texts = child.xpath('./text()')
            if texts:
                return texts[0].strip()
        return None

    def process(self, element: HtmlElement):
        """
        extract title from element

        Priority: the meta title if there is one, otherwise the h1-h3 text
        most similar to the <title> text, otherwise the <title> text itself.
        :param element: element to extract the title from
        :return: extracted title
        """
        title_extracted_by_meta = self.extract_by_meta(element)

        # split logic to add more
        if title_extracted_by_meta:
            return title_extracted_by_meta

        title_extracted_by_hs = self.extract_by_hs(element)
        title_extracted_by_title = self.extract_by_title(element)

        # Longest headings come first; together with the strict ">" below this
        # means the longest heading wins when several share the best score.
        candidates = sorted(title_extracted_by_hs, key=len, reverse=True)

        best_heading = None
        best_score = 0
        for heading in candidates:
            # Keep the raw score: truncating it with int() would turn every
            # partial match into 0 and make the ranking meaningless.
            score = similarity2(heading, title_extracted_by_title)
            if score > best_score:
                best_heading = heading
                best_score = score

        if best_heading is not None:
            return best_heading

        # No heading at all, or none sharing a character with <title>:
        # fall back to the <title> text instead of indexing the heading list
        # with an arbitrary position.
        return title_extracted_by_title

# module-level TitleExtractor instance that extract_title delegates to
title_extractor = TitleExtractor()

def extract_title(html):
    """
    extract title from html by delegating to the module-level title_extractor
    :param html: html to extract the title from
    :return: value returned by title_extractor.extract for this html
    """
    return title_extractor.extract(html)

#把title 替换成我这个就解决了

cbc123 avatar Nov 14 '22 07:11 cbc123