Thank you for producing this pipeline - it's quite helpful when dealing with huge sc datasets! I'm running into an issue when I try to merge two zarr objects.
The error isn't actually thrown until the mark_hvgs() step, but it appears to stem from the ZarrMerge step not writing per-feature cell counts (i.e., the merged .zarr directory has no data files under test.zarr -> RNA -> counts).
After merging the two .zarr objects I would not have expected the DataStore to report "RNA assay has 0 (22032) features"; all nCells values are 0, so the merge apparently did not aggregate the counts. I played with the overwrite option and adjusted update_key based on the vignette, but I can't get the zarr files to merge correctly. My end goal is to merge 115 .zarr directories, but the problem already shows up with just two.
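For what it's worth, this is roughly how I confirmed that the counts never made it into the merged store (the group/column names under test.zarr are my reading of the on-disk layout, so apologies if they're slightly off):

```python
import numpy as np
import zarr

# open the merged store read-only and look at the RNA assay group
z = zarr.open("test.zarr", mode="r")
print(z["RNA"].tree())  # the counts group is listed, but it holds no chunk files on disk

# nCells should sit in the RNA feature metadata (group names assumed from the layout)
n_cells = np.array(z["RNA"]["featureData"]["nCells"])
print(np.unique(n_cells))  # -> [0]
print(int((n_cells > 0).sum()), "of", n_cells.size, "features detected in at least one cell")
```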
The code I used is below, lightly simplified (apologies if anything is reported incorrectly; I haven't filed many issues on GitHub before).
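Paths and sample names are placeholders, and I'm going from the merging vignette, so the exact argument names may be slightly off, but these are the calls I made (the mark_hvgs arguments match the traceback below):

```python
import scarf

# per-sample stores that were processed individually beforehand (placeholder paths)
ds1 = scarf.DataStore("sample1.zarr", nthreads=4)
ds2 = scarf.DataStore("sample2.zarr", nthreads=4)

# merge the two RNA assays into a new zarr store
scarf.ZarrMerge(
    zarr_path="test.zarr",
    assays=[ds1.RNA, ds2.RNA],
    names=["sample1", "sample2"],
    merge_assay_name="RNA",
    overwrite=True,  # also tried without overwrite
).dump()

# load the merged store and mark HVGs; this is where the OverflowError below is raised
ds = scarf.DataStore("test.zarr", nthreads=4)
ds.mark_hvgs(min_cells=20, top_n=2000, min_mean=-3, max_mean=2)
```

The console output from creating the merged DataStore and calling mark_hvgs() follows: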
(RNA) Computing nCells and dropOuts: 100%| 1150/1150 [00:00]
(RNA) Computing nCounts: 100%| 1155/1155 [00:00]
WARNING: Minimum cell count (0) is lower than size factor multiplier (1000)
(RNA) Computing nFeatures: 100%| 1155/1155 [00:00]
(RNA) Computing RNA_percentMito: 100%| 1113/1113 [00:00]
WARNING: Percentage feature RNA_percentMito not added because not detected in any cell
(RNA) Computing RNA_percentRibo: 100%| 1113/1113 [00:00]
WARNING: Percentage feature RNA_percentRibo not added because not detected in any cell
WARNING: More than of half of the less have less than 10 features for assay: RNA. Will not remove low quality cells automatically.
DataStore has 20369 (20369) cells with 1 assays: RNA
Cell metadata:
'I', 'ids', 'names', 'RNA_nCounts', 'RNA_nFeatures',
'orig_RNA_nCounts', 'orig_RNA_nFeatures', 'orig_RNA_percentMito', 'orig_RNA_percentRibo'
RNA assay has 0 (22032) features and following metadata:
'I', 'ids', 'names', 'dropOuts', 'nCells',
/opt/anaconda3/lib/python3.9/site-packages/dask/array/slicing.py:647: RuntimeWarning: divide by zero encountered in long_scalars
maxsize = math.ceil(nbytes / (other_numel * itemsize))
OverflowError Traceback (most recent call last)
/var/folders/b4/14kj85s94x52tlh_ptt0vf6r0000gn/T/ipykernel_63026/3363267796.py in <module>
----> 1 ds.mark_hvgs(
2 min_cells=20,
3 top_n=2000,
4 min_mean=-3,
5 max_mean=2,
/opt/anaconda3/lib/python3.9/site-packages/scarf/datastore.py in mark_hvgs(self, from_assay, cell_key, min_cells, top_n, min_var, max_var, min_mean, max_mean, n_bins, lowess_frac, blacklist, show_plot, hvg_key_name, **plot_kwargs)
3604 f"of cells will be considered HVGs."
3605 )
-> 3606 assay.mark_hvgs(
3607 cell_key,
3608 min_cells,
/opt/anaconda3/lib/python3.9/site-packages/scarf/assay.py in mark_hvgs(self, cell_key, min_cells, top_n, min_var, max_var, min_mean, max_mean, n_bins, lowess_frac, blacklist, hvg_key_name, show_plot, **plot_kwargs)
938 return f"{identifier}_{x}"
939
--> 940 self.set_feature_stats(cell_key, min_cells)
941 identifier = self._load_stats_loc(cell_key)
942 c_var_col = f"c_var__{n_bins}__{lowess_frac}"
/opt/anaconda3/lib/python3.9/site-packages/scarf/assay.py in set_feature_stats(self, cell_key, min_cells)
830 return None
831 n_cells = show_dask_progress(
--> 832 (self.normed(cell_idx, feat_idx) > 0).sum(axis=0),
833 f"({self.name}) Computing nCells",
834 self.nthreads,
/opt/anaconda3/lib/python3.9/site-packages/scarf/assay.py in normed(self, cell_idx, feat_idx, renormalize_subset, log_transform, **kwargs)
792 if feat_idx is None:
793 feat_idx = self.feats.active_index("I")
--> 794 counts = self.rawData[:, feat_idx][cell_idx, :]
795 norm_method_cache = self.normMethod
796 if log_transform:
/opt/anaconda3/lib/python3.9/site-packages/dask/array/core.py in __getitem__(self, index)
1798
1799 out = "getitem-" + tokenize(self, index2)
-> 1800 dsk, chunks = slice_array(out, self.name, self.chunks, index2, self.itemsize)
1801
1802 graph = HighLevelGraph.from_collections(out, dsk, dependencies=[self])
/opt/anaconda3/lib/python3.9/site-packages/dask/array/slicing.py in slice_array(out_name, in_name, blockdims, index, itemsize)
172
173 # Pass down to next function
--> 174 dsk_out, bd_out = slice_with_newaxes(out_name, in_name, blockdims, index, itemsize)
175
176 bd_out = tuple(map(tuple, bd_out))
/opt/anaconda3/lib/python3.9/site-packages/dask/array/slicing.py in slice_with_newaxes(out_name, in_name, blockdims, index, itemsize)
194
195 # Pass down and do work
--> 196 dsk, blockdims2 = slice_wrap_lists(out_name, in_name, blockdims, index2, itemsize)
197
198 if where_none:
/opt/anaconda3/lib/python3.9/site-packages/dask/array/slicing.py in slice_wrap_lists(out_name, in_name, blockdims, index, itemsize)
260 if all(is_arraylike(i) or i == slice(None, None, None) for i in index):
261 axis = where_list[0]
--> 262 blockdims2, dsk3 = take(
263 out_name, in_name, blockdims, index[where_list[0]], itemsize, axis=axis
264 )
/opt/anaconda3/lib/python3.9/site-packages/dask/array/slicing.py in take(outname, inname, chunks, index, itemsize, axis)
645 warnsize = maxsize = math.inf
646 else:
--> 647 maxsize = math.ceil(nbytes / (other_numel * itemsize))
648 warnsize = maxsize * 5
649
OverflowError: cannot convert float infinity to integer