Unexpected / Uninformative Error messages with simple input data and two_sample 'wald' or 'lrt'

I came across the following errors while using Splatter-simulated data.

Error 1 - Infs and NaNs

import numpy as np
import scanpy as sc
import diffxpy.api as de

test_data = np.array([[ 0,  6, 31,  8,  4, 24, 14, 17,  1,  6],
                      [ 0,  2, 25,  1,  0, 18,  4, 31,  3,  4],
                      [ 3,  4, 17, 11,  3, 22,  7, 27,  1, 21],
                      [ 0,  7, 13,  4,  7, 35,  5, 22,  6,  1],
                      [ 0,  5, 19,  0,  7, 24, 12, 23,  1, 15],
                      [ 1,  5, 34,  0,  8, 29, 19, 28,  4,  2],
                      [ 0,  9, 22,  3, 11, 32, 17, 21,  2,  7],
                      [ 0,  4, 26,  4,  5, 26,  6, 25,  5, 16],
                      [ 0,  6, 46,  0,  5, 96, 24, 23,  0, 12],
                      [ 0,  5, 19,  2,  2, 25,  4, 27,  0,  4]])


grouping = np.array([0, 1, 0, 1, 0, 0, 1, 1, 0, 0])
gene_names = np.arange(10)

de.test.two_sample(test_data, gene_names=gene_names, grouping=grouping, test='wald', noise_model="nb")

raises:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-86-7c3866d394b1> in <module>
----> 1 de.test.two_sample(test_data, gene_names=gene_names, grouping=grouping, test='wald', noise_model="nb")

~/.local/lib/python3.8/site-packages/diffxpy/testing/tests.py in two_sample(data, grouping, as_numeric, test, gene_names, sample_description, noise_model, size_factors, batch_size, backend, train_args, training_strategy, is_sig_zerovar, quick_scale, dtype, **kwargs)
   1028         formula_loc = '~ 1 + grouping'
   1029         formula_scale = '~ 1 + grouping'
-> 1030         de_test = wald(
   1031             data=data,
   1032             factor_loc_totest="grouping",

~/.local/lib/python3.8/site-packages/diffxpy/testing/tests.py in wald(data, factor_loc_totest, coef_to_test, formula_loc, formula_scale, as_numeric, init_a, init_b, gene_names, sample_description, dmat_loc, dmat_scale, constraints_loc, constraints_scale, noise_model, size_factors, batch_size, backend, train_args, training_strategy, quick_scale, dtype, **kwargs)
    715 
    716     # Fit model.
--> 717     model = _fit(
    718         noise_model=noise_model,
    719         data=data,

~/.local/lib/python3.8/site-packages/diffxpy/testing/tests.py in _fit(noise_model, data, design_loc, design_scale, design_loc_names, design_scale_names, constraints_loc, constraints_scale, init_model, init_a, init_b, gene_names, size_factors, batch_size, backend, training_strategy, quick_scale, train_args, close_session, dtype)
    220         raise ValueError('backend="%s" not recognized.' % backend)
    221 
--> 222     estim = Estimator(
    223         input_data=input_data,
    224         init_a=init_a,

~/.local/lib/python3.8/site-packages/batchglm/train/numpy/glm_nb/estimator.py in __init__(self, input_data, init_a, init_b, batch_size, quick_scale, dtype, **kwargs)
     72             self._train_scale = False
     73 
---> 74         self.model_vars = ModelVars(
     75             init_a=init_a,
     76             init_b=init_b,

~/.local/lib/python3.8/site-packages/batchglm/train/numpy/base_glm/vars.py in __init__(self, init_a, init_b, constraints_loc, constraints_scale, chunk_size_genes, dtype)
     42 
     43         init_a_clipped = self.np_clip_param(np.asarray(init_a, dtype=dtype), "a_var")
---> 44         init_b_clipped = self.np_clip_param(np.asarray(init_b, dtype=dtype), "b_var")
     45         self.params = dask.array.from_array(np.concatenate(
     46             [

/usr/lib/python3.8/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
     83 
     84     """
---> 85     return array(a, dtype, copy=False, order=order)
     86 
     87 

~/.local/lib/python3.8/site-packages/dask/array/core.py in __array__(self, dtype, **kwargs)
   1312 
   1313     def __array__(self, dtype=None, **kwargs):
-> 1314         x = self.compute()
   1315         if dtype and x.dtype != dtype:
   1316             x = x.astype(dtype)

~/.local/lib/python3.8/site-packages/dask/base.py in compute(self, **kwargs)
    163         dask.base.compute
    164         """
--> 165         (result,) = compute(self, traverse=False, **kwargs)
    166         return result
    167 

~/.local/lib/python3.8/site-packages/dask/base.py in compute(*args, **kwargs)
    434     keys = [x.__dask_keys__() for x in collections]
    435     postcomputes = [x.__dask_postcompute__() for x in collections]
--> 436     results = schedule(dsk, keys, **kwargs)
    437     return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
    438 

~/.local/lib/python3.8/site-packages/dask/threaded.py in get(dsk, result, cache, num_workers, pool, **kwargs)
     71                 pools[thread][num_workers] = pool
     72 
---> 73     results = get_async(
     74         pool.apply_async,
     75         len(pool._pool),

~/.local/lib/python3.8/site-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
    484                         _execute_task(task, data)  # Re-execute locally
    485                     else:
--> 486                         raise_exception(exc, tb)
    487                 res, worker_id = loads(res_info)
    488                 state["cache"][key] = res

~/.local/lib/python3.8/site-packages/dask/local.py in reraise(exc, tb)
    314     if exc.__traceback__ is not tb:
    315         raise exc.with_traceback(tb)
--> 316     raise exc
    317 
    318 

~/.local/lib/python3.8/site-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
    220     try:
    221         task, data = loads(task_info)
--> 222         result = _execute_task(task, data)
    223         id = get_id()
    224         result = dumps((result, id))

~/.local/lib/python3.8/site-packages/dask/core.py in _execute_task(arg, cache, dsk)
    117         func, args = arg[0], arg[1:]
    118         args2 = [_execute_task(a, cache) for a in args]
--> 119         return func(*args2)
    120     elif not ishashable(arg):
    121         return arg

~/.local/lib/python3.8/site-packages/scipy/linalg/basic.py in solve_triangular(a, b, trans, lower, unit_diagonal, overwrite_b, debug, check_finite)
    334 
    335     a1 = _asarray_validated(a, check_finite=check_finite)
--> 336     b1 = _asarray_validated(b, check_finite=check_finite)
    337     if len(a1.shape) != 2 or a1.shape[0] != a1.shape[1]:
    338         raise ValueError('expected square matrix')

~/.local/lib/python3.8/site-packages/scipy/_lib/_util.py in _asarray_validated(a, check_finite, sparse_ok, objects_ok, mask_ok, as_inexact)
    244             raise ValueError('masked arrays are not supported')
    245     toarray = np.asarray_chkfinite if check_finite else np.asarray
--> 246     a = toarray(a)
    247     if not objects_ok:
    248         if a.dtype is np.dtype('O'):

/usr/lib/python3.8/site-packages/numpy/lib/function_base.py in asarray_chkfinite(a, dtype, order)
    496     a = asarray(a, dtype=dtype, order=order)
    497     if a.dtype.char in typecodes['AllFloat'] and not np.isfinite(a).all():
--> 498         raise ValueError(
    499             "array must not contain infs or NaNs")
    500     return a

ValueError: array must not contain infs or NaNs

Looking at test_data, it's clear there are no infs or NaNs in the input. I don't know batchglm, so I can't tell where the non-finite values are actually introduced.
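
For what it's worth, a quick sanity check (my addition, not part of the original report) confirms the input really is finite, which suggests the non-finite values arise inside batchglm's parameter initialization, where the traceback bottoms out in a triangular solve used to compute init_b, rather than in the data itself:

# Editor's sketch: confirm the raw counts themselves are finite.
assert np.isfinite(test_data).all()        # passes: no infs or NaNs
print(test_data.min(), test_data.max())    # 0 96 -> ordinary non-negative counts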

Error 2 - More than one scale parameter with simple data

While hunting for a minimal reproducible example of the error above, I came across the following error, which I also don't know how to interpret.

simple_data = np.random.normal(loc=100, size=(100,2))
adata = sc.AnnData(simple_data)
grouping = np.random.choice([0,1], size=simple_data.shape[0])

de.test.two_sample(adata, grouping=grouping, test='wald', noise_model="nb")

raises

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-3-63450a98cfa2> in <module>
      3 grouping = np.random.choice([0,1], size=simple_data.shape[0])
      4 
----> 5 de.test.two_sample(adata, grouping=grouping, test='wald', noise_model="nb")

~/.local/lib/python3.8/site-packages/diffxpy/testing/tests.py in two_sample(data, grouping, as_numeric, test, gene_names, sample_description, noise_model, size_factors, batch_size, backend, train_args, training_strategy, is_sig_zerovar, quick_scale, dtype, **kwargs)
   1028         formula_loc = '~ 1 + grouping'
   1029         formula_scale = '~ 1 + grouping'
-> 1030         de_test = wald(
   1031             data=data,
   1032             factor_loc_totest="grouping",

~/.local/lib/python3.8/site-packages/diffxpy/testing/tests.py in wald(data, factor_loc_totest, coef_to_test, formula_loc, formula_scale, as_numeric, init_a, init_b, gene_names, sample_description, dmat_loc, dmat_scale, constraints_loc, constraints_scale, noise_model, size_factors, batch_size, backend, train_args, training_strategy, quick_scale, dtype, **kwargs)
    715 
    716     # Fit model.
--> 717     model = _fit(
    718         noise_model=noise_model,
    719         data=data,

~/.local/lib/python3.8/site-packages/diffxpy/testing/tests.py in _fit(noise_model, data, design_loc, design_scale, design_loc_names, design_scale_names, constraints_loc, constraints_scale, init_model, init_a, init_b, gene_names, size_factors, batch_size, backend, training_strategy, quick_scale, train_args, close_session, dtype)
    220         raise ValueError('backend="%s" not recognized.' % backend)
    221 
--> 222     estim = Estimator(
    223         input_data=input_data,
    224         init_a=init_a,

~/.local/lib/python3.8/site-packages/batchglm/train/numpy/glm_nb/estimator.py in __init__(self, input_data, init_a, init_b, batch_size, quick_scale, dtype, **kwargs)
     87             dtype=dtype
     88         )
---> 89         super(Estimator, self).__init__(
     90             input_data=input_data,
     91             model=model,

~/.local/lib/python3.8/site-packages/batchglm/train/numpy/base_glm/estimator.py in __init__(self, model, input_data, dtype)
     31     ):
     32         if input_data.design_scale.shape[1] != 1:
---> 33             raise ValueError("cannot model more than one scale parameter with numpy backend right now.")
     34         _EstimatorGLM.__init__(
     35             self=self,

ValueError: cannot model more than one scale parameter with numpy backend right now.
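
For context on where this shape check fails: the traceback shows that two_sample builds the dispersion model from the formula '~ 1 + grouping', which yields a two-column scale design matrix. A small illustration (mine, not from the report) with patsy, the formula library diffxpy uses, reusing the grouping vector from the snippet above:

import pandas as pd
import patsy

# Illustration only: the scale design matrix implied by '~ 1 + grouping'.
sample_description = pd.DataFrame({"grouping": grouping.astype(str)})
dmat = patsy.dmatrix("~ 1 + grouping", sample_description)
print(dmat.shape)  # (100, 2): intercept + grouping, i.e. two scale parameters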

I'm wondering whether these errors are limited to the two_sample helper function; I'll check later whether I can reproduce them using de.test.wald.

dburkhardt, Feb 24 '20 18:02

Hi @dburkhardt, thanks for the detailed issue! Somehow two_sample escaped the unit tests so far, sorry about that. The error ValueError: cannot model more than one scale parameter with numpy backend right now. was thrown because two_sample set a default dispersion model that is no longer supported by the new (more stable) GLM backend. I changed this default in the diffxpy dev branch, feel free to check it out. The first error was also related to this; at least I can no longer reproduce it after including this fix.
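
For readers on a released version who can't switch to the dev branch, a possible workaround (my sketch, assuming the fix amounts to an intercept-only dispersion model, not the maintainer's exact change) is to call de.test.wald directly with formula_scale="~ 1", so the numpy backend only has to fit a single scale parameter. This reuses test_data, grouping, and gene_names from the first snippet:

import pandas as pd

# Sketch: an intercept-only scale model keeps design_scale to one column.
sample_description = pd.DataFrame({"grouping": grouping.astype(str)})
res = de.test.wald(
    data=test_data,
    factor_loc_totest="grouping",
    formula_loc="~ 1 + grouping",
    formula_scale="~ 1",            # single scale parameter
    sample_description=sample_description,
    gene_names=gene_names,
    noise_model="nb",
)
print(res.summary().head())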

davidsebfischer, Feb 25 '20 09:02