datajoint-python Read an empty cell array blob fails

Bug Report

Description

Reading an empty cell array inserted with mym MATLAB fails to be read in Datajoint python

Reproducibility

OS (WIN (MATLAB) & MACOS (Python)
Python Version (3.7) & MATLAB Version (2019b)
MySQL Version (10.2.33-MariaDB)
DataJoint Version (0.13.7)

I have a corner case for reading some special. blobs in Datajoint Python when these are stored with mym Matlab: Here is the type of blob stored in the DB and read on Matlab:

la = bdata('select protocol_data from bdata.sessions where sessid=889527');
la{1}.crash_comments
ans =
  3×1 cell array
    {0×0 double}
    {0×0 double}
    {0×0 double}

As you can see, what is stored in a part of the blob is a 3x1 cell array composed of empty items:

When trying to read this data in Python, I got this error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/var/folders/sg/5bw1t8p11nx09k7kmytnmfb40000gp/T/ipykernel_31219/4072205635.py in <module>
      3 session_key = {'sessid': 889527}
      4 # session_key = {'sessid': 889664}
----> 5 session_data = (bdata.Sessions & session_key).fetch('protocol_data', as_dict=True)
      6 parsed_events = (bdata.ParsedEvents & session_key).fetch(as_dict=True)

~/opt/anaconda3/envs/bl_pipeline_python_env/lib/python3.7/site-packages/datajoint/fetch.py in __call__(self, offset, limit, order_by, format, as_dict, squeeze, download_path, *attrs)
    234                 squeeze=squeeze,
    235                 download_path=download_path,
--> 236                 format="array",
    237             )
    238             if attrs_as_dict:

~/opt/anaconda3/envs/bl_pipeline_python_env/lib/python3.7/site-packages/datajoint/fetch.py in __call__(self, offset, limit, order_by, format, as_dict, squeeze, download_path, *attrs)
    287                 for name in heading:
    288                     # unpack blobs and externals
--> 289                     ret[name] = list(map(partial(get, heading[name]), ret[name]))
    290                 if format == "frame":
    291                     ret = pandas.DataFrame(ret).set_index(heading.primary_key)

~/opt/anaconda3/envs/bl_pipeline_python_env/lib/python3.7/site-packages/datajoint/fetch.py in _get(connection, attr, data, squeeze, download_path)
    112                 squeeze=squeeze,
    113             )
--> 114             if attr.is_blob
    115             else data
    116         )

~/opt/anaconda3/envs/bl_pipeline_python_env/lib/python3.7/site-packages/datajoint/blob.py in unpack(blob, squeeze)
    619         return blob
    620     if blob is not None:
--> 621         return Blob(squeeze=squeeze).unpack(blob)

~/opt/anaconda3/envs/bl_pipeline_python_env/lib/python3.7/site-packages/datajoint/blob.py in unpack(self, blob)
    127         blob_format = self.read_zero_terminated_string()
    128         if blob_format in ("mYm", "dj0"):
--> 129             return self.read_blob(n_bytes=len(self._blob) - self._pos)
    130 
    131     def read_blob(self, n_bytes=None):

~/opt/anaconda3/envs/bl_pipeline_python_env/lib/python3.7/site-packages/datajoint/blob.py in read_blob(self, n_bytes)
    161                 % data_structure_code
    162             )
--> 163         v = call()
    164         if n_bytes is not None and self._pos - start != n_bytes:
    165             raise DataJointError("Blob length check failed! Invalid blob")

~/opt/anaconda3/envs/bl_pipeline_python_env/lib/python3.7/site-packages/datajoint/blob.py in read_struct(self)
    463                 self.read_blob(n_bytes=int(self.read_value())) for _ in range(n_fields)
    464             )
--> 465             for __ in range(n_elem)
    466         ]
    467 

~/opt/anaconda3/envs/bl_pipeline_python_env/lib/python3.7/site-packages/datajoint/blob.py in <listcomp>(.0)
    463                 self.read_blob(n_bytes=int(self.read_value())) for _ in range(n_fields)
    464             )
--> 465             for __ in range(n_elem)
    466         ]
    467 

~/opt/anaconda3/envs/bl_pipeline_python_env/lib/python3.7/site-packages/datajoint/blob.py in <genexpr>(.0)
    461         raw_data = [
    462             tuple(
--> 463                 self.read_blob(n_bytes=int(self.read_value())) for _ in range(n_fields)
    464             )
    465             for __ in range(n_elem)

~/opt/anaconda3/envs/bl_pipeline_python_env/lib/python3.7/site-packages/datajoint/blob.py in read_blob(self, n_bytes)
    161                 % data_structure_code
    162             )
--> 163         v = call()
    164         if n_bytes is not None and self._pos - start != n_bytes:
    165             raise DataJointError("Blob length check failed! Invalid blob")

~/opt/anaconda3/envs/bl_pipeline_python_env/lib/python3.7/site-packages/datajoint/blob.py in read_cell_array(self)
    508         return (
    509             self.squeeze(
--> 510                 np.array(result).reshape(shape, order="F"), convert_to_scalar=False
    511             )
    512         ).view(MatCell)

ValueError: cannot reshape array of size 0 into shape (3,1)

I have “patched” the blob.py code read_cell_array function with:

        if result.size == 0:
            return (
            self.squeeze(
                np.array(np.empty(shape, dtype=type(result[0]))), convert_to_scalar=False
            )
        ).view(MatCell)
        else:
            return (
            self.squeeze(
                np.array(result).reshape(shape, order="F"), convert_to_scalar=False
            )
        ).view(MatCell)

Just to add the case that the size of the array is zero (numpy array size is 0 if it’s filled with empty arrays) Probably not the cleanest way to do it.

Expected Behavior

To get something similar to this when reading this kind of blobs:

session_data['crash_comments']
MatCell([[None],
         [None],
         [None]], dtype=object)

Sep 29 '22 17:09 Alvalunasan

Thank you for submitting this @Alvalunasan. I think I understand the problem.

Sep 29 '22 17:09 dimitri-yatsenko

Hi @dimitri-yatsenko In brodylab this error has resurfaced with the new datajoint integration. Is it possible that this gets merged ? (or the bug fixed somehow) ?

Thank you very much for your help

Jun 06 '24 18:06 Alvalunasan

Hi @dimitri-yatsenko , a note that this is not restricted to old Matlabs-- it was happening with a recent Matlab , and on data from August 2023. Maybe newer data too, I haven't yet checked on newer data.

Would it be appropriate to merge Alvaro's patch?

Jun 06 '24 19:06 carlosbrody

@Alvalunasan updates his patch and suggests replacing lines 495-499 with

    sizes_array = [x.size for x in result]
    sum_sizes = sum(sizes_array)

    if n_elem ==0:
        return np.array(np.empty(0)).view(MatCell)
    elif sum_sizes == 0:
        return (self.squeeze(np.array(np.empty(shape, dtype=type(result[0]))), convert_to_scalar=False)).view(MatCell)
    else:
        return (self.squeeze(np.array(result).reshape(shape, order="F"), convert_to_scalar=False)).view(MatCell)

Jun 06 '24 22:06 carlosbrody

ok, will incorporate asap. We are starting to work on a new release. Thanks.

Jun 06 '24 23:06 dimitri-yatsenko

See next coment

Jun 08 '24 21:06 Alvalunasan

@dimitri-yatsenko , @carlosbrody

New corner case for the function (Cell Matrix reading, instead of only nx1 vectors):

  def read_cell_array(self):
       """deserialize MATLAB cell array"""
       load_as_object = False
       n_dims = self.read_value()
       shape = self.read_value(count=n_dims)
       n_elem = int(np.prod(shape))
       result = [self.read_blob(n_bytes=self.read_value()) for _ in range(n_elem)]

       # If it is a matrix (and not a nx1 vector) load as object
       if np.sum(shape > 1) > 1:
           load_as_object = True

       # Check size for each element (could have Empty elements in vector)
       if n_elem > 0:
           # If there are arrays, tuple or list inside elements of result, load as object (except if all emptys)
           if isinstance(result[0], np.ndarray):
               sizes_array = [x.size for x in result]
               sum_sizes = sum(sizes_array)
               load_as_object = True
           elif isinstance(result[0], tuple) or isinstance(result[0], list):
               sizes_array = [len(x) for x in result]
               sum_sizes = sum(sizes_array)
               load_as_object = True
           else:
               sum_sizes = n_elem

       # If no trials in array
       if n_elem ==0:
           return np.array(np.empty(0)).view(MatCell)
       # If all trials contains "empty" data
       elif sum_sizes == 0:
           return (self.squeeze(np.array(np.empty(shape, dtype=type(result[0]))), convert_to_scalar=False)).view(MatCell)
       # If some trials contains data and others contains "empty" data
       elif sum_sizes != n_elem or load_as_object:
           return (self.squeeze(np.array(result, dtype='object').reshape(shape, order="F"), convert_to_scalar=False)).view(MatCell)
       # Regular case, all trials contains data
       else:
           return (self.squeeze(np.array(result).reshape(shape, order="F"), convert_to_scalar=False)).view(MatCell)

Jun 10 '24 22:06 Alvalunasan

Excellent. I am traveling until next week and will work on this when I return. Thank you so much for this solution.

Jun 11 '24 15:06 dimitri-yatsenko