pymorphy2
pymorphy2 copied to clipboard
Pymorphy2 fails to parse Unicode characters with no name defined
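pymorphy2 raises `ValueError: no such name` when parsing a word that contains an alphabetic Unicode character for which `unicodedata.name()` has no name defined: `is_latin_char` in `shapes.py` calls `unicodedata.name(uchr)` without a default value, so the lookup error propagates out of `MorphAnalyzer.parse`. Minimal example: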
import pymorphy2
pymorphy2.MorphAnalyzer().parse('𘐐')
Traceback:
KeyError Traceback (most recent call last)
~/project/.venv/lib/python3.8/site-packages/pymorphy2/shapes.py in is_latin_char(uchr)
12 try:
---> 13 return _latin_letters_cache[uchr]
14 except KeyError:
KeyError: '𘐐'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-47-a1997180ffcd> in <module>
1 import pymorphy2
2
----> 3 pymorphy2.MorphAnalyzer().parse('𘐐')
~/project/.venv/lib/python3.8/site-packages/pymorphy2/analyzer.py in parse(self, word)
313
314 for analyzer, is_terminal in self._units:
--> 315 res.extend(analyzer.parse(word, word_lower, seen))
316
317 if is_terminal and res:
~/project/.venv/lib/python3.8/site-packages/pymorphy2/units/by_shape.py in parse(self, word, word_lower, seen_parses)
25
26 def parse(self, word, word_lower, seen_parses):
---> 27 shape = self.check_shape(word, word_lower)
28 if not shape:
29 return []
~/project/.venv/lib/python3.8/site-packages/pymorphy2/units/by_shape.py in check_shape(self, word, word_lower)
89
90 def check_shape(self, word, word_lower):
---> 91 return is_latin(word)
92
93
~/project/.venv/lib/python3.8/site-packages/pymorphy2/shapes.py in is_latin(token)
38 return (
39 any(ch.isalpha() for ch in token) and
---> 40 all(is_latin_char(ch) for ch in token if ch.isalpha())
41 )
42
~/project/.venv/lib/python3.8/site-packages/pymorphy2/shapes.py in <genexpr>(.0)
38 return (
39 any(ch.isalpha() for ch in token) and
---> 40 all(is_latin_char(ch) for ch in token if ch.isalpha())
41 )
42
~/project/.venv/lib/python3.8/site-packages/pymorphy2/shapes.py in is_latin_char(uchr)
15 if isinstance(uchr, bytes):
16 uchr = uchr.decode('ascii')
---> 17 is_latin = 'LATIN' in unicodedata.name(uchr)
18 return _latin_letters_cache.setdefault(uchr, is_latin)
19
ValueError: no such name