Error with combining kerchunk mappings with MultiZarrToZarr
I have many *.nc files in a directory that I am opening as a single dataset with kerchunk. Each nc file holds an array of shape (N, 1), where the length-1 dimension is the one I want to concatenate along. Here is a minimal reproducible example:
# Minimal reproducer: two NetCDF files, each holding a (1000, 1) variable 'A',
# are turned into kerchunk references, combined along 'y', and opened with xarray.
from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.combine import MultiZarrToZarr
import xarray as xr
import numpy as np
# create toy data: both datasets hold random values of shape (1000, 1) with
# dims ('x', 'y'); they share the same 'x' coordinate and differ in 'y' (0 vs 1)
ds1 = xr.Dataset({'A':xr.DataArray(np.random.rand(1000,1), dims=['x','y'], coords={'x':np.arange(1000),'y':[0]})})
ds2 = xr.Dataset({'A':xr.DataArray(np.random.rand(1000,1), dims=['x','y'], coords={'x':np.arange(1000),'y':[1]})})
# write each dataset to its own NetCDF file (relative paths)
ds1.to_netcdf('ds1.nc')
ds2.to_netcdf('ds2.nc')
# create kerchunk mapping: one reference set per file, collected in a list
mappings = [SingleHdf5ToZarr('ds1.nc').translate(), SingleHdf5ToZarr('ds2.nc').translate()]
# combine the per-file reference sets into one, with 'y' as the concat dim and
# 'x' declared as an identical dim; mzz holds the translated (combined) result
mzz = MultiZarrToZarr(mappings, concat_dims=['y'], identical_dims=['x']).translate()
# open dataset with xarray: the combined references are handed to the
# "reference://" URL through storage_options as 'fo'
so = {'fo': mzz}
ds = xr.open_dataset(
"reference://", engine="zarr", backend_kwargs={"consolidated": False, "storage_options": so}
).chunk({'y':1})
# NOTE: .chunk({'y': 1}) above requests chunks of size 1 along 'y'
This works, and if I call ds['A'].values, everything is loaded correctly. But if I try to load a single slice in the y dimension (i.e. ds['A'][:,0].values), I get the error:
ValueError: could not broadcast input array from shape (1000,1) into shape (1000,)
If I call ds['A'][:,:1] (preserving the y dimension with length one), it works as expected.
Here is the contents of the mzz variable:
[{'version': 1,
'refs': {'.zgroup': '{"zarr_format":2}',
'A/.zarray': '{"chunks":[1000,1],"compressor":null,"dtype":"<f8","fill_value":"NaN","filters":null,"order":"C","shape":[1000,1],"zarr_format":2}',
'A/.zattrs': '{"_ARRAY_DIMENSIONS":["x","y"]}',
'A/0.0': ['ds1.nc', 17256, 8000],
'x/.zarray': '{"chunks":[1000],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[1000],"zarr_format":2}',
'x/.zattrs': '{"_ARRAY_DIMENSIONS":["x"]}',
'x/0': ['ds1.nc', 983, 8000],
'y/.zarray': '{"chunks":[1],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[1],"zarr_format":2}',
'y/.zattrs': '{"_ARRAY_DIMENSIONS":["y"]}',
'y/0': '\x00\x00\x00\x00\x00\x00\x00\x00'}},
{'version': 1,
'refs': {'.zgroup': '{"zarr_format":2}',
'A/.zarray': '{"chunks":[1000,1],"compressor":null,"dtype":"<f8","fill_value":"NaN","filters":null,"order":"C","shape":[1000,1],"zarr_format":2}',
'A/.zattrs': '{"_ARRAY_DIMENSIONS":["x","y"]}',
'A/0.0': ['ds2.nc', 17256, 8000],
'x/.zarray': '{"chunks":[1000],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[1000],"zarr_format":2}',
'x/.zattrs': '{"_ARRAY_DIMENSIONS":["x"]}',
'x/0': ['ds2.nc', 983, 8000],
'y/.zarray': '{"chunks":[1],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[1],"zarr_format":2}',
'y/.zattrs': '{"_ARRAY_DIMENSIONS":["y"]}',
'y/0': '\x01\x00\x00\x00\x00\x00\x00\x00'}}]
I'm currently running:
- python: 3.12.4
- xarray: 2024.6.0
- kerchunk: 0.2.6
- zarr: 2.18.2
- numpy: 2.0.0
Thanks!
> Here is the contents of the mzz variable:
That is not the content of the mzz variable. It is the list of kerchunk'd references (i.e. `mappings`).
This has come up before, I'll try to find it.
For completeness, here is what mzz looks like for me:
{'version': 1,
'refs': {'.zgroup': '{"zarr_format":2}',
'y/.zarray': '{\n "chunks": [\n 2\n ],\n "compressor": null,\n "dtype": "<i8",\n "fill_value": null,\n "filters": null,\n "order": "C",\n "shape": [\n 2\n ],\n "zarr_format": 2\n}',
'y/0': '\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00',
'y/.zattrs': '{\n "_ARRAY_DIMENSIONS": [\n "y"\n ]\n}',
'A/.zarray': '{"chunks":[1000,1],"compressor":null,"dtype":"<f8","fill_value":"NaN","filters":null,"order":"C","shape":[1000,2],"zarr_format":2}',
'A/.zattrs': '{"_ARRAY_DIMENSIONS":["x","y"]}',
'A/0.0': ['ds1.nc', 17256, 8000],
'x/.zarray': '{"chunks":[1000],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[1000],"zarr_format":2}',
'x/.zattrs': '{"_ARRAY_DIMENSIONS":["x"]}',
'x/0': ['ds1.nc', 983, 8000],
'A/0.1': ['ds2.nc', 17256, 8000]}}
and here is the exception
File ~/conda/envs/py310/lib/python3.10/site-packages/xarray/core/indexing.py:554, in LazilyIndexedArray.get_duck_array(self)
553 def get_duck_array(self):
--> 554 array = self.array[self.key]
555 # self.array[self.key] is now a numpy array when
556 # self.array is a BackendArray subclass
557 # and self.key is BasicIndexer((slice(None, None, None),))
558 # so we need the explicit check for ExplicitlyIndexed
559 if isinstance(array, ExplicitlyIndexed):
File ~/conda/envs/py310/lib/python3.10/site-packages/xarray/backends/zarr.py:94, in ZarrArrayWrapper.__getitem__(self, key)
92 array = self.get_array()
93 if isinstance(key, indexing.BasicIndexer):
---> 94 return array[key.tuple]
95 elif isinstance(key, indexing.VectorizedIndexer):
96 return array.vindex[
97 indexing._arrayize_vectorized_indexer(key, self.shape).tuple
98 ]
File ~/code/zarr/zarr/core.py:798, in Array.__getitem__(self, selection)
796 result = self.vindex[selection]
797 elif is_pure_orthogonal_indexing(pure_selection, self.ndim):
--> 798 result = self.get_orthogonal_selection(pure_selection, fields=fields)
799 else:
800 result = self.get_basic_selection(pure_selection, fields=fields)
File ~/code/zarr/zarr/core.py:1080, in Array.get_orthogonal_selection(self, selection, out, fields)
1077 # setup indexer
1078 indexer = OrthogonalIndexer(selection, self)
-> 1080 return self._get_selection(indexer=indexer, out=out, fields=fields)
File ~/code/zarr/zarr/core.py:1343, in Array._get_selection(self, indexer, out, fields)
1340 if math.prod(out_shape) > 0:
1341 # allow storage to get multiple items at once
1342 lchunk_coords, lchunk_selection, lout_selection = zip(*indexer)
-> 1343 self._chunk_getitems(
1344 lchunk_coords,
1345 lchunk_selection,
1346 out,
1347 lout_selection,
1348 drop_axes=indexer.drop_axes,
1349 fields=fields,
1350 )
1351 if out.shape:
1352 return out
File ~/code/zarr/zarr/core.py:2183, in Array._chunk_getitems(self, lchunk_coords, lchunk_selection, out, lout_selection, drop_axes, fields)
2181 for ckey, chunk_select, out_select in zip(ckeys, lchunk_selection, lout_selection):
2182 if ckey in cdatas:
-> 2183 self._process_chunk(
2184 out,
2185 cdatas[ckey],
2186 chunk_select,
2187 drop_axes,
2188 out_is_ndarray,
2189 fields,
2190 out_select,
2191 partial_read_decode=partial_read_decode,
2192 )
2193 else:
2194 # check exception type
2195 if self._fill_value is not None:
File ~/code/zarr/zarr/core.py:2057, in Array._process_chunk(self, out, cdata, chunk_selection, drop_axes, out_is_ndarray, fields, out_selection, partial_read_decode)
2055 chunk = ensure_ndarray_like(cdata).view(self._dtype)
2056 chunk = chunk.reshape(self._chunks, order=self._order)
-> 2057 np.copyto(dest, chunk)
2058 return
2060 # decode chunk
ValueError: could not broadcast input array from shape (1000,1) into shape (1000,)