UnicodeDecodeError

Open pratapbhanu opened this issue 12 years ago • 0 comments

When I use p = ttp.Parser(); ttp.parse(tweettext, html=False); I get exceptions for some tweets due to invalid HTML character formatting, for example:

Traceback (most recent call last): File "/home/bhanu/git/misc/twitter/src/preprocessing.py", line 34, in extract_tweet_tags("twitter12051154249.txt") File "/home/bhanu/git/misc/twitter/src/preprocessing.py", line 28, in extract_tweet_tags result = ttp.Parser().parse(tweet.strip(), html=True)
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 131, in parse parsed_html = self._html(text) if html else self._text(text) File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 148, in _html return HASHTAG_REGEX.sub(self._parse_tags, html) File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 249, in _parse_tags return '%s%s' % (pre, self.format_tag(tag, text)) File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 270, in format_tag % (urllib.quote('#' + text.encode('utf-8')), tag, text) File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode return codecs.utf_8_decode(input, errors, True) UnicodeDecodeError: 'utf8' codec can't decode byte 0xa5 in position 0: invalid start byte

I fixed it by adding a line in the following methods:

def parse(self, text, html=True):
    '''Parse the text and return a ParseResult instance.

    Resets the per-call result lists, records whether HTML output was
    requested, then runs either the HTML pass or the plain-text pass
    over `text`.
    '''
    self._urls = []
    self._users = []
    self._lists = []
    self._tags = []
    # Added to fix a bug: keep the requested mode as a plain flag so that
    # _parse_tags can check it when deciding whether to format the tag.
    self._is_html = html

    reply = REPLY_REGEX.match(text)
    reply = reply.groups(0)[0] if reply is not None else None

    parsed_html = self._html(text) if html else self._text(text)
    return ParseResult(self._urls, self._users, reply,
                       self._lists, self._tags, parsed_html)

def _parse_tags(self, match):
    '''Parse hashtags.

    Records the hashtag text found in `match` on self._tags (together with
    its span when self._include_spans is set). In HTML mode, returns the
    text preceding the hash sign followed by the formatted tag; otherwise
    falls through and returns None.
    '''

    mat = match.group(0)

    # Fix problems with the regex capturing stuff in front of the #
    # (both the ASCII '#' and the fullwidth U+FF03 form are tried, in
    # that order; the last occurrence of the first one found wins).
    tag = None
    for i in u'#\uff03':
        pos = mat.rfind(i)
        if pos != -1:
            tag = i
            break

    pre, text = mat[:pos], mat[pos + 1:]
    if self._include_spans:
        span = match.span(0)
        # add an offset if pre is e.g. ' '
        span = (span[0] + len(pre), span[1])
        self._tags.append((text, span))
    else:
        self._tags.append(text)

    # Test the flag stored by parse(), not self._html: self._html is a
    # method and therefore always truthy, which made this branch call
    # format_tag even when plain-text output was requested.
    if self._is_html:
        return '%s%s' % (pre, self.format_tag(tag, text))

Jun 05 '13 10:06 pratapbhanu