
Segmentation fault when using tensorstore with multiple workers in pytorch dataloader

Open · paulhager opened this issue 1 year ago • 1 comment

Hi Tensorstore team,

Thanks for the great library; I've found it very useful so far. I recently ran into an issue when using tensorstore with multiple workers in a pytorch dataloader, and I was hoping someone could help me resolve it.

The following examples require an existing tensorstore zarr v2 store; substitute its path for INSERT_PATH_TO_TS_STORE_ZARR_2.

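For reference, a small zarr v2 store for these examples can be created roughly like this (the shape and dtype here are arbitrary placeholders):

import numpy as np
import tensorstore as ts

# Create a tiny zarr v2 store to read from in the examples below.
# Shape and dtype are arbitrary placeholders.
store = ts.open(
    {
        "driver": "zarr",
        "kvstore": {
            "driver": "file",
            "path": INSERT_PATH_TO_TS_STORE_ZARR_2,
        },
    },
    create=True,
    dtype=ts.float32,
    shape=[2, 8],
).result()
store.write(np.random.rand(2, 8).astype(np.float32)).result()
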
  1. Opening a tensorstore in the __init__ of a dataset throws a segmentation fault when using multiple workers in a pytorch dataloader:
import torch
from torch.utils.data import DataLoader, Dataset
import tensorstore as ts


class TS_Dataset(Dataset):
    def __init__(self) -> None:
        embeddings_store_path = INSERT_PATH_TO_TS_STORE_ZARR_2

        tensorstore_spec = {
            "driver": "zarr",
            "kvstore": {
                "driver": "file",
                "path": embeddings_store_path,
            },
        }

        self.data = ts.open(tensorstore_spec).result()

    def __getitem__(self, index: int):
        subject = torch.tensor(self.data[0].read().result(), dtype=torch.float32)
        return subject

    def __len__(self) -> int:
        return 2


dataset = TS_Dataset()

dataloader = DataLoader(
    dataset,
    num_workers=2,
)

for batch in dataloader:
    print(batch)

  2. I suspect this has something to do with tensorstore being initialized multiple times, so I was able to "fix" it with the following modification:
import torch
from torch.utils.data import DataLoader, Dataset
import tensorstore as ts


class TS_Dataset(Dataset):
    def __init__(self) -> None:
        super().__init__()

    def _open_data(self):
        embeddings_store_path = INSERT_PATH_TO_TS_STORE_ZARR_2

        tensorstore_spec = {
            "driver": "zarr",
            "kvstore": {
                "driver": "file",
                "path": embeddings_store_path,
            },
        }

        self.data = ts.open(tensorstore_spec).result()

    def __getitem__(self, index: int):
        if not hasattr(self, "data"):
            self._open_data()
        subject = torch.tensor(self.data[0].read().result(), dtype=torch.float32)
        return subject

    def __len__(self) -> int:
        return 2


dataset = TS_Dataset()

dataloader = DataLoader(
    dataset,
    num_workers=2,
)

for batch in dataloader:
    print(batch)

  3. This worked for a while, but now I need to open the same store at a different place in my code, and it breaks again:
import torch
from torch.utils.data import DataLoader, Dataset
import tensorstore as ts


class TS_Dataset(Dataset):
    def __init__(self) -> None:
        super().__init__()

    def _open_data(self):
        embeddings_store_path = INSERT_PATH_TO_TS_STORE_ZARR_2

        tensorstore_spec = {
            "driver": "zarr",
            "kvstore": {
                "driver": "file",
                "path": embeddings_store_path,
            },
        }

        self.data = ts.open(tensorstore_spec).result()

    def __getitem__(self, index: int):
        if not hasattr(self, "data"):
            self._open_data()
        subject = torch.tensor(self.data[0].read().result(), dtype=torch.float32)
        return subject

    def __len__(self) -> int:
        return 2


embeddings_store_path = INSERT_PATH_TO_TS_STORE_ZARR_2

tensorstore_spec = {
    "driver": "zarr",
    "kvstore": {
        "driver": "file",
        "path": embeddings_store_path,
    },
}

data = ts.open(tensorstore_spec).result()

dataset = TS_Dataset()

dataloader = DataLoader(
    dataset,
    num_workers=2,
)

for batch in dataloader:
    print(batch)

  4. If I try to get around this by opening the store once outside and passing it into the dataset, it still doesn't work:
import torch
from torch.utils.data import DataLoader, Dataset
import tensorstore as ts


class TS_Dataset(Dataset):
    def __init__(self, open_tensorstore) -> None:
        self.data = open_tensorstore

    def __getitem__(self, index: int):
        subject = torch.tensor(self.data[0].read().result(), dtype=torch.float32)
        return subject

    def __len__(self) -> int:
        return 2


embeddings_store_path = INSERT_PATH_TO_TS_STORE_ZARR_2

tensorstore_spec = {
    "driver": "zarr",
    "kvstore": {
        "driver": "file",
        "path": embeddings_store_path,
    },
}

open_tensorstore = ts.open(tensorstore_spec).result()

dataset = TS_Dataset(open_tensorstore)

dataloader = DataLoader(
    dataset,
    num_workers=2,
)

for batch in dataloader:
    print(batch)

I've kind of run out of ideas at this point and would greatly appreciate any help.

I'm using tensorstore version 0.1.64 and pytorch version 2.2.2.

Let me know if you need any further information.

paulhager avatar Aug 30 '24 12:08 paulhager

The issue is that tensorstore uses multiple threads internally and therefore isn't fork-safe. It will probably work if you can arrange not to use tensorstore before forking.
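
One sketch of how that might look, assuming the lazily-opening TS_Dataset from the second example above: switch the DataLoader workers to the "spawn" start method, so each worker is a fresh interpreter rather than a fork of a process that has already used tensorstore.

# Sketch: reuse the lazily-opening TS_Dataset from the second example.
# "spawn" starts fresh worker processes instead of forking, so tensorstore
# state created in the main process is never inherited across a fork.
dataset = TS_Dataset()

dataloader = DataLoader(
    dataset,
    num_workers=2,
    multiprocessing_context="spawn",
)

for batch in dataloader:
    print(batch)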

jbms avatar Aug 31 '24 14:08 jbms