kerchunk icon indicating copy to clipboard operation
kerchunk copied to clipboard

Error with combining kerchunk mappings with MultiZarrToZarr

Open John-Ragland opened this issue 1 year ago • 2 comments

I have many *.nc files in a directory that I am opening into a single dataset with kerchunk. Each nc file holds an array of shape (N,1), where the dimension of length 1 is the one I want to concatenate along. Here is a minimal reproducible example:

from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.combine import MultiZarrToZarr
import xarray as xr
import numpy as np

# Toy data: two (1000, 1) arrays sharing the same 'x' coordinate and
# differing only in their single 'y' coordinate value (0 and 1).
ds1, ds2 = (
    xr.Dataset(
        {
            'A': xr.DataArray(
                np.random.rand(1000, 1),
                dims=['x', 'y'],
                coords={'x': np.arange(1000), 'y': [y_value]},
            )
        }
    )
    for y_value in (0, 1)
)

# Persist each dataset to its own netCDF file so kerchunk can scan it.
ds1.to_netcdf('ds1.nc')
ds2.to_netcdf('ds2.nc')

# One kerchunk reference set per netCDF file, in file order.
mappings = [
    SingleHdf5ToZarr(nc_path).translate()
    for nc_path in ('ds1.nc', 'ds2.nc')
]

# Combine the per-file references: 'y' is passed as the concat dimension,
# 'x' as a dimension declared identical across the inputs.
mzz = MultiZarrToZarr(
    mappings,
    concat_dims=['y'],
    identical_dims=['x'],
).translate()
so = {'fo': mzz}

# Open the combined references through the "reference://" filesystem and
# request chunks of size 1 along 'y'.
ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={"consolidated": False, "storage_options": so},
).chunk({'y': 1})

This works, and if I call ds['A'].values, everything is loaded correctly, but if I try to load a single slice in the y dimension (i.e. ds['A'][:,0].values), I get the error:

ValueError: could not broadcast input array from shape (1000,1) into shape (1000,)

If I call ds['A'][:,:1], (preserving the y dimension with shape one), it works as expected.

Here is the contents of the mzz variable:

[{'version': 1,
  'refs': {'.zgroup': '{"zarr_format":2}',
   'A/.zarray': '{"chunks":[1000,1],"compressor":null,"dtype":"<f8","fill_value":"NaN","filters":null,"order":"C","shape":[1000,1],"zarr_format":2}',
   'A/.zattrs': '{"_ARRAY_DIMENSIONS":["x","y"]}',
   'A/0.0': ['ds1.nc', 17256, 8000],
   'x/.zarray': '{"chunks":[1000],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[1000],"zarr_format":2}',
   'x/.zattrs': '{"_ARRAY_DIMENSIONS":["x"]}',
   'x/0': ['ds1.nc', 983, 8000],
   'y/.zarray': '{"chunks":[1],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[1],"zarr_format":2}',
   'y/.zattrs': '{"_ARRAY_DIMENSIONS":["y"]}',
   'y/0': '\x00\x00\x00\x00\x00\x00\x00\x00'}},
 {'version': 1,
  'refs': {'.zgroup': '{"zarr_format":2}',
   'A/.zarray': '{"chunks":[1000,1],"compressor":null,"dtype":"<f8","fill_value":"NaN","filters":null,"order":"C","shape":[1000,1],"zarr_format":2}',
   'A/.zattrs': '{"_ARRAY_DIMENSIONS":["x","y"]}',
   'A/0.0': ['ds2.nc', 17256, 8000],
   'x/.zarray': '{"chunks":[1000],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[1000],"zarr_format":2}',
   'x/.zattrs': '{"_ARRAY_DIMENSIONS":["x"]}',
   'x/0': ['ds2.nc', 983, 8000],
   'y/.zarray': '{"chunks":[1],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[1],"zarr_format":2}',
   'y/.zattrs': '{"_ARRAY_DIMENSIONS":["y"]}',
   'y/0': '\x01\x00\x00\x00\x00\x00\x00\x00'}}]

I'm currently running:

  • python: 3.12.4
  • xarray: 2024.6.0
  • kerchunk: 0.2.6
  • zarr: 2.18.2
  • numpy: 2.0.0

Thanks!

John-Ragland avatar Jul 24 '24 21:07 John-Ragland

Here is the contents of the mzz variable:

That is not the content of the mzz variable. It is the list of kerchunk'd references.

Anu-Ra-g avatar Jul 25 '24 07:07 Anu-Ra-g

This has come up before, I'll try to find it.

For completeness, here is what mzz looks like for me:

{'version': 1,
 'refs': {'.zgroup': '{"zarr_format":2}',
  'y/.zarray': '{\n    "chunks": [\n        2\n    ],\n    "compressor": null,\n    "dtype": "<i8",\n    "fill_value": null,\n    "filters": null,\n    "order": "C",\n    "shape": [\n        2\n    ],\n    "zarr_format": 2\n}',
  'y/0': '\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00',
  'y/.zattrs': '{\n    "_ARRAY_DIMENSIONS": [\n        "y"\n    ]\n}',
  'A/.zarray': '{"chunks":[1000,1],"compressor":null,"dtype":"<f8","fill_value":"NaN","filters":null,"order":"C","shape":[1000,2],"zarr_format":2}',
  'A/.zattrs': '{"_ARRAY_DIMENSIONS":["x","y"]}',
  'A/0.0': ['ds1.nc', 17256, 8000],
  'x/.zarray': '{"chunks":[1000],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[1000],"zarr_format":2}',
  'x/.zattrs': '{"_ARRAY_DIMENSIONS":["x"]}',
  'x/0': ['ds1.nc', 983, 8000],
  'A/0.1': ['ds2.nc', 17256, 8000]}}

and here is the exception

File ~/conda/envs/py310/lib/python3.10/site-packages/xarray/core/indexing.py:554, in LazilyIndexedArray.get_duck_array(self)
    553 def get_duck_array(self):
--> 554     array = self.array[self.key]
    555     # self.array[self.key] is now a numpy array when
    556     # self.array is a BackendArray subclass
    557     # and self.key is BasicIndexer((slice(None, None, None),))
    558     # so we need the explicit check for ExplicitlyIndexed
    559     if isinstance(array, ExplicitlyIndexed):

File ~/conda/envs/py310/lib/python3.10/site-packages/xarray/backends/zarr.py:94, in ZarrArrayWrapper.__getitem__(self, key)
     92 array = self.get_array()
     93 if isinstance(key, indexing.BasicIndexer):
---> 94     return array[key.tuple]
     95 elif isinstance(key, indexing.VectorizedIndexer):
     96     return array.vindex[
     97         indexing._arrayize_vectorized_indexer(key, self.shape).tuple
     98     ]

File ~/code/zarr/zarr/core.py:798, in Array.__getitem__(self, selection)
    796     result = self.vindex[selection]
    797 elif is_pure_orthogonal_indexing(pure_selection, self.ndim):
--> 798     result = self.get_orthogonal_selection(pure_selection, fields=fields)
    799 else:
    800     result = self.get_basic_selection(pure_selection, fields=fields)

File ~/code/zarr/zarr/core.py:1080, in Array.get_orthogonal_selection(self, selection, out, fields)
   1077 # setup indexer
   1078 indexer = OrthogonalIndexer(selection, self)
-> 1080 return self._get_selection(indexer=indexer, out=out, fields=fields)

File ~/code/zarr/zarr/core.py:1343, in Array._get_selection(self, indexer, out, fields)
   1340 if math.prod(out_shape) > 0:
   1341     # allow storage to get multiple items at once
   1342     lchunk_coords, lchunk_selection, lout_selection = zip(*indexer)
-> 1343     self._chunk_getitems(
   1344         lchunk_coords,
   1345         lchunk_selection,
   1346         out,
   1347         lout_selection,
   1348         drop_axes=indexer.drop_axes,
   1349         fields=fields,
   1350     )
   1351 if out.shape:
   1352     return out

File ~/code/zarr/zarr/core.py:2183, in Array._chunk_getitems(self, lchunk_coords, lchunk_selection, out, lout_selection, drop_axes, fields)
   2181 for ckey, chunk_select, out_select in zip(ckeys, lchunk_selection, lout_selection):
   2182     if ckey in cdatas:
-> 2183         self._process_chunk(
   2184             out,
   2185             cdatas[ckey],
   2186             chunk_select,
   2187             drop_axes,
   2188             out_is_ndarray,
   2189             fields,
   2190             out_select,
   2191             partial_read_decode=partial_read_decode,
   2192         )
   2193     else:
   2194         # check exception type
   2195         if self._fill_value is not None:

File ~/code/zarr/zarr/core.py:2057, in Array._process_chunk(self, out, cdata, chunk_selection, drop_axes, out_is_ndarray, fields, out_selection, partial_read_decode)
   2055             chunk = ensure_ndarray_like(cdata).view(self._dtype)
   2056             chunk = chunk.reshape(self._chunks, order=self._order)
-> 2057             np.copyto(dest, chunk)
   2058         return
   2060 # decode chunk

ValueError: could not broadcast input array from shape (1000,1) into shape (1000,)

martindurant avatar Jul 25 '24 13:07 martindurant