Bug in oov_share calculation:
There is a bug in the oov_share calculation in evidently/features/OOV_words_percentage_feature.py.
If a string is empty (zero length) or None, you will get an error:
File /usr/local/lib/python3.8/dist-packages/evidently/test_suite/test_suite.py:104, in TestSuite.run(self, reference_data, current_data, column_mapping)
102 self._add_tests_from_generator(test_generator)
103 self._inner_suite.verify()
--> 104 curr_add, ref_add = self._inner_suite.create_additional_features(current_data, reference_data, data_definition)
105 data = InputData(reference_data, current_data, ref_add, curr_add, column_mapping, data_definition)
107 self._inner_suite.run_calculate(data)
File /usr/local/lib/python3.8/dist-packages/evidently/suite/base_suite.py:358, in Suite.create_additional_features(self, current_data, reference_data, data_definition)
356 continue
357 features[_id] = feature
--> 358 feature_data = feature.generate_feature(current_data, data_definition)
359 feature_data.columns = [f"{feature.__class__.__name__}.{old}" for old in feature_data.columns]
360 if curr_additional_data is None:
File /usr/local/lib/python3.8/dist-packages/evidently/features/OOV_words_percentage_feature.py:44, in OOVWordsPercentage.generate_feature(self, data, data_definition)
36 oov_num += 1
37 return 100 * oov_num / len(words_)
39 return pd.DataFrame(
40 dict(
41 [
42 (
43 self.column_name,
---> 44 data[self.column_name].apply(lambda x: oov_share(x, ignore_words=self.ignore_words)),
45 )
46 ]
47 )
48 )
File /usr/local/lib/python3.8/dist-packages/pandas/core/series.py:4630, in Series.apply(self, func, convert_dtype, args, **kwargs)
4520 def apply(
4521 self,
4522 func: AggFuncType,
(...)
4525 **kwargs,
4526 ) -> DataFrame | Series:
4527 """
4528 Invoke function on values of Series.
4529
(...)
4628 dtype: float64
4629 """
-> 4630 return SeriesApply(self, func, convert_dtype, args, kwargs).apply()
File /usr/local/lib/python3.8/dist-packages/pandas/core/apply.py:1025, in SeriesApply.apply(self)
1022 return self.apply_str()
1024 # self.f is Callable
-> 1025 return self.apply_standard()
File /usr/local/lib/python3.8/dist-packages/pandas/core/apply.py:1076, in SeriesApply.apply_standard(self)
1074 else:
1075 values = obj.astype(object)._values
-> 1076 mapped = lib.map_infer(
1077 values,
1078 f,
1079 convert=self.convert_dtype,
1080 )
1082 if len(mapped) and isinstance(mapped[0], ABCSeries):
1083 # GH#43986 Need to do list(mapped) in order to get treated as nested
1084 # See also GH#25959 regarding EA support
1085 return obj._constructor_expanddim(list(mapped), index=obj.index)
File /usr/local/lib/python3.8/dist-packages/pandas/_libs/lib.pyx:2834, in pandas._libs.lib.map_infer()
File /usr/local/lib/python3.8/dist-packages/evidently/features/OOV_words_percentage_feature.py:44, in OOVWordsPercentage.generate_feature.<locals>.<lambda>(x)
36 oov_num += 1
37 return 100 * oov_num / len(words_)
39 return pd.DataFrame(
40 dict(
41 [
42 (
43 self.column_name,
---> 44 data[self.column_name].apply(lambda x: oov_share(x, ignore_words=self.ignore_words)),
45 )
46 ]
47 )
48 )
File /usr/local/lib/python3.8/dist-packages/evidently/features/OOV_words_percentage_feature.py:37, in OOVWordsPercentage.generate_feature.<locals>.oov_share(s, ignore_words)
35 if word.lower() not in ignore_words and self._lem.lemmatize(word.lower()) not in self._eng_words:
36 oov_num += 1
---> 37 return 100 * oov_num / len(words_)
ZeroDivisionError: division by zero
Specifically, the problem seems to be in lines 33-37 of evidently/features/OOV_words_percentage_feature.py.
my code:
reference = pd.DataFrame(dataset['test'])
new = pd.DataFrame(dataset['train'])
column_mapping = ColumnMapping(
categorical_features=['string_column_1', 'string_column_2', 'string_column_3','label_int'],
text_features=['subject', 'text']
)
data_quality = TestSuite(tests=[
DataQualityTestPreset(),
])
data_quality.run(reference_data=reference, current_data=new,column_mapping=column_mapping)
Hi @danilyef ,
Thank you for reporting the bug!
@emeli-dral
The bug is very easy to fix: I created a pull request, https://github.com/evidentlyai/evidently/pull/651 . Could you or other developers look at my pull request and merge it if it looks good?
@emeli-dral I have seen the test results for my pull request: for all versions except Python 3.7 there is an error:
sklearn.utils._param_validation.InvalidParameterError: The 'as_frame
Can you have a look at it?
Hi @danilyef
Thanks for the contribution!
I can see that the tests are failing for Python 3.8 and above. This is connected to the scikit-learn version, we are going to fix it.
We are working on a new feature in a separate branch - we will review and merge your pull request together with that branch to make sure we resolve this version conflict.
It should happen at some point next week!