Unexpected / uninformative error messages with simple input data and two_sample 'wald' or 'lrt'
I came across the following errors while using Splatter-simulated data.
Error 1: Infs and NaNs
import numpy as np
import scanpy as sc
import diffxpy.api as de
test_data = np.array([[ 0, 6, 31, 8, 4, 24, 14, 17, 1, 6],
[ 0, 2, 25, 1, 0, 18, 4, 31, 3, 4],
[ 3, 4, 17, 11, 3, 22, 7, 27, 1, 21],
[ 0, 7, 13, 4, 7, 35, 5, 22, 6, 1],
[ 0, 5, 19, 0, 7, 24, 12, 23, 1, 15],
[ 1, 5, 34, 0, 8, 29, 19, 28, 4, 2],
[ 0, 9, 22, 3, 11, 32, 17, 21, 2, 7],
[ 0, 4, 26, 4, 5, 26, 6, 25, 5, 16],
[ 0, 6, 46, 0, 5, 96, 24, 23, 0, 12],
[ 0, 5, 19, 2, 2, 25, 4, 27, 0, 4]])
grouping = np.array([0, 1, 0, 1, 0, 0, 1, 1, 0, 0])
gene_names = np.arange(10)
de.test.two_sample(test_data, gene_names=gene_names, grouping=grouping, test='wald', noise_model="nb")
raises:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-86-7c3866d394b1> in <module>
----> 1 de.test.two_sample(test_data, gene_names=gene_names, grouping=grouping, test='wald', noise_model="nb")
~/.local/lib/python3.8/site-packages/diffxpy/testing/tests.py in two_sample(data, grouping, as_numeric, test, gene_names, sample_description, noise_model, size_factors, batch_size, backend, train_args, training_strategy, is_sig_zerovar, quick_scale, dtype, **kwargs)
1028 formula_loc = '~ 1 + grouping'
1029 formula_scale = '~ 1 + grouping'
-> 1030 de_test = wald(
1031 data=data,
1032 factor_loc_totest="grouping",
~/.local/lib/python3.8/site-packages/diffxpy/testing/tests.py in wald(data, factor_loc_totest, coef_to_test, formula_loc, formula_scale, as_numeric, init_a, init_b, gene_names, sample_description, dmat_loc, dmat_scale, constraints_loc, constraints_scale, noise_model, size_factors, batch_size, backend, train_args, training_strategy, quick_scale, dtype, **kwargs)
715
716 # Fit model.
--> 717 model = _fit(
718 noise_model=noise_model,
719 data=data,
~/.local/lib/python3.8/site-packages/diffxpy/testing/tests.py in _fit(noise_model, data, design_loc, design_scale, design_loc_names, design_scale_names, constraints_loc, constraints_scale, init_model, init_a, init_b, gene_names, size_factors, batch_size, backend, training_strategy, quick_scale, train_args, close_session, dtype)
220 raise ValueError('backend="%s" not recognized.' % backend)
221
--> 222 estim = Estimator(
223 input_data=input_data,
224 init_a=init_a,
~/.local/lib/python3.8/site-packages/batchglm/train/numpy/glm_nb/estimator.py in __init__(self, input_data, init_a, init_b, batch_size, quick_scale, dtype, **kwargs)
72 self._train_scale = False
73
---> 74 self.model_vars = ModelVars(
75 init_a=init_a,
76 init_b=init_b,
~/.local/lib/python3.8/site-packages/batchglm/train/numpy/base_glm/vars.py in __init__(self, init_a, init_b, constraints_loc, constraints_scale, chunk_size_genes, dtype)
42
43 init_a_clipped = self.np_clip_param(np.asarray(init_a, dtype=dtype), "a_var")
---> 44 init_b_clipped = self.np_clip_param(np.asarray(init_b, dtype=dtype), "b_var")
45 self.params = dask.array.from_array(np.concatenate(
46 [
/usr/lib/python3.8/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
83
84 """
---> 85 return array(a, dtype, copy=False, order=order)
86
87
~/.local/lib/python3.8/site-packages/dask/array/core.py in __array__(self, dtype, **kwargs)
1312
1313 def __array__(self, dtype=None, **kwargs):
-> 1314 x = self.compute()
1315 if dtype and x.dtype != dtype:
1316 x = x.astype(dtype)
~/.local/lib/python3.8/site-packages/dask/base.py in compute(self, **kwargs)
163 dask.base.compute
164 """
--> 165 (result,) = compute(self, traverse=False, **kwargs)
166 return result
167
~/.local/lib/python3.8/site-packages/dask/base.py in compute(*args, **kwargs)
434 keys = [x.__dask_keys__() for x in collections]
435 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 436 results = schedule(dsk, keys, **kwargs)
437 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
438
~/.local/lib/python3.8/site-packages/dask/threaded.py in get(dsk, result, cache, num_workers, pool, **kwargs)
71 pools[thread][num_workers] = pool
72
---> 73 results = get_async(
74 pool.apply_async,
75 len(pool._pool),
~/.local/lib/python3.8/site-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
484 _execute_task(task, data) # Re-execute locally
485 else:
--> 486 raise_exception(exc, tb)
487 res, worker_id = loads(res_info)
488 state["cache"][key] = res
~/.local/lib/python3.8/site-packages/dask/local.py in reraise(exc, tb)
314 if exc.__traceback__ is not tb:
315 raise exc.with_traceback(tb)
--> 316 raise exc
317
318
~/.local/lib/python3.8/site-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
220 try:
221 task, data = loads(task_info)
--> 222 result = _execute_task(task, data)
223 id = get_id()
224 result = dumps((result, id))
~/.local/lib/python3.8/site-packages/dask/core.py in _execute_task(arg, cache, dsk)
117 func, args = arg[0], arg[1:]
118 args2 = [_execute_task(a, cache) for a in args]
--> 119 return func(*args2)
120 elif not ishashable(arg):
121 return arg
~/.local/lib/python3.8/site-packages/scipy/linalg/basic.py in solve_triangular(a, b, trans, lower, unit_diagonal, overwrite_b, debug, check_finite)
334
335 a1 = _asarray_validated(a, check_finite=check_finite)
--> 336 b1 = _asarray_validated(b, check_finite=check_finite)
337 if len(a1.shape) != 2 or a1.shape[0] != a1.shape[1]:
338 raise ValueError('expected square matrix')
~/.local/lib/python3.8/site-packages/scipy/_lib/_util.py in _asarray_validated(a, check_finite, sparse_ok, objects_ok, mask_ok, as_inexact)
244 raise ValueError('masked arrays are not supported')
245 toarray = np.asarray_chkfinite if check_finite else np.asarray
--> 246 a = toarray(a)
247 if not objects_ok:
248 if a.dtype is np.dtype('O'):
/usr/lib/python3.8/site-packages/numpy/lib/function_base.py in asarray_chkfinite(a, dtype, order)
496 a = asarray(a, dtype=dtype, order=order)
497 if a.dtype.char in typecodes['AllFloat'] and not np.isfinite(a).all():
--> 498 raise ValueError(
499 "array must not contain infs or NaNs")
500 return a
ValueError: array must not contain infs or NaNs
Looking at test_data, it clearly contains no infs or NaNs, so whatever is non-finite must be produced internally during fitting. I don't know batchglm, so I can't really tell where this comes from.
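Just to rule out the obvious, a quick check on the input (values verified against the matrix above):

np.isfinite(test_data).all()      # True: the input itself contains no infs or NaNs
test_data.min(), test_data.max()  # (0, 96): plain non-negative counts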
Error 2: More than one scale parameter with simple data
While hunting for a minimal reproducible example for the above error, I came across the following error, which I also don't know how to interpret.
simple_data = np.random.normal(loc=100, size=(100, 2))  # not count data, but the error below is raised during estimator construction, before any fitting
adata = sc.AnnData(simple_data)
grouping = np.random.choice([0, 1], size=simple_data.shape[0])
de.test.two_sample(adata, grouping=grouping, test='wald', noise_model="nb")
raises:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-3-63450a98cfa2> in <module>
3 grouping = np.random.choice([0,1], size=simple_data.shape[0])
4
----> 5 de.test.two_sample(adata, grouping=grouping, test='wald', noise_model="nb")
~/.local/lib/python3.8/site-packages/diffxpy/testing/tests.py in two_sample(data, grouping, as_numeric, test, gene_names, sample_description, noise_model, size_factors, batch_size, backend, train_args, training_strategy, is_sig_zerovar, quick_scale, dtype, **kwargs)
1028 formula_loc = '~ 1 + grouping'
1029 formula_scale = '~ 1 + grouping'
-> 1030 de_test = wald(
1031 data=data,
1032 factor_loc_totest="grouping",
~/.local/lib/python3.8/site-packages/diffxpy/testing/tests.py in wald(data, factor_loc_totest, coef_to_test, formula_loc, formula_scale, as_numeric, init_a, init_b, gene_names, sample_description, dmat_loc, dmat_scale, constraints_loc, constraints_scale, noise_model, size_factors, batch_size, backend, train_args, training_strategy, quick_scale, dtype, **kwargs)
715
716 # Fit model.
--> 717 model = _fit(
718 noise_model=noise_model,
719 data=data,
~/.local/lib/python3.8/site-packages/diffxpy/testing/tests.py in _fit(noise_model, data, design_loc, design_scale, design_loc_names, design_scale_names, constraints_loc, constraints_scale, init_model, init_a, init_b, gene_names, size_factors, batch_size, backend, training_strategy, quick_scale, train_args, close_session, dtype)
220 raise ValueError('backend="%s" not recognized.' % backend)
221
--> 222 estim = Estimator(
223 input_data=input_data,
224 init_a=init_a,
~/.local/lib/python3.8/site-packages/batchglm/train/numpy/glm_nb/estimator.py in __init__(self, input_data, init_a, init_b, batch_size, quick_scale, dtype, **kwargs)
87 dtype=dtype
88 )
---> 89 super(Estimator, self).__init__(
90 input_data=input_data,
91 model=model,
~/.local/lib/python3.8/site-packages/batchglm/train/numpy/base_glm/estimator.py in __init__(self, model, input_data, dtype)
31 ):
32 if input_data.design_scale.shape[1] != 1:
---> 33 raise ValueError("cannot model more than one scale parameter with numpy backend right now.")
34 _EstimatorGLM.__init__(
35 self=self,
ValueError: cannot model more than one scale parameter with numpy backend right now.
I'm wondering whether these errors are limited to the two_sample helper function; I'll check later whether I can reproduce them with de.test.wald directly (sketched below).
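For reference, this is roughly the direct call I intend to try. It mirrors the formula_loc = '~ 1 + grouping' that two_sample builds internally (visible in the traceback above), but with an intercept-only scale model, which, if the single-scale-parameter limitation from error 2 is the culprit, might also work around it. The pandas sample_description construction is my guess at the expected input format:

import pandas as pd

sample_description = pd.DataFrame({"grouping": grouping})
de.test.wald(
    data=test_data,
    factor_loc_totest="grouping",
    formula_loc="~ 1 + grouping",
    formula_scale="~ 1",          # single (intercept-only) scale parameter
    gene_names=gene_names,
    sample_description=sample_description,
    noise_model="nb",
)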
Hi @dburkhardt, thanks for the detailed issue! Somehow two_sample has escaped the unit tests so far, sorry about that. The error ValueError: cannot model more than one scale parameter with numpy backend right now was thrown because two_sample set a default dispersion model that is no longer supported by the new (more stable) GLM backend. I changed this default on the diffxpy dev branch, feel free to check it out. The first error was also related to this: at least I cannot reproduce it anymore after including the fix.
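If you want to try it before the next release, the dev branch should be installable directly from GitHub, e.g. pip install git+https://github.com/theislab/diffxpy.git@dev (the usual pip-from-git pattern).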