streaming
streaming copied to clipboard
Unexpected MDS format data for JSON encoding / how to encode a list of strings
Create a simple dataset with lists
from torch.utils.data import DataLoader
import streaming
from streaming import StreamingDataset
from streaming.base.converters import dataframe_to_mds
from datasets import Dataset
import pandas as pd
# Toy dataset: three rows holding an integer label, a list of strings and a
# list of floats (the two list columns are identical in every row).
# NOTE(review): `spark` is not defined in this snippet; presumably an ambient
# SparkSession (e.g. a notebook session) - confirm.
df = spark.createDataFrame(
    pd.DataFrame(
        {
            "label": [0, 1, 0],
            "data1": [["a", "b", "c", "d"] for _ in range(3)],
            "data2": [[1.0, 2.0, 3.0, 4.0] for _ in range(3)],
        }
    )
)
def udf_processing(df):
    """Yield the samples of ``df`` one at a time.

    ``df`` is wrapped in a Hugging Face ``Dataset`` through
    ``Dataset.from_pandas`` and every sample produced by iterating that
    dataset is yielded unchanged, in iteration order.
    """
    yield from Dataset.from_pandas(df=df)
# Writer settings: output location plus the encoding declared for each column.
# Both list-valued columns are declared as "json", the label as "int64".
mds_kwargs = {"out": "mds"}
mds_kwargs["columns"] = {
    "data1": "json",
    "data2": "json",
    "label": "int64",
}

# Convert the DataFrame to MDS, feeding rows through `udf_processing`.
dataframe_to_mds(
    df,
    udf_iterable=udf_processing,
    mds_kwargs=mds_kwargs,
    merge_index=True,
)

# Read the dataset back from the local "mds" directory (no remote).
dataset = StreamingDataset(
    local="mds",
    remote=None,
    predownload=4,
    batch_size=1,
)
dataloader = DataLoader(dataset, num_workers=1, batch_size=1)

# Fetch the first batch. Left as a bare expression so an interactive
# session still displays the value.
# NOTE(review): no `collate_fn` is passed to the DataLoader, so the layout of
# list-valued fields in the batch is whatever the loader's default batching
# produces - confirm against the PyTorch docs.
batch_iterator = iter(dataloader)
next(batch_iterator)
Batch result:
{'data1': [('a',), ('b',), ('c',), ('d',)],
'data2': [tensor([1.], dtype=torch.float64),
tensor([2.], dtype=torch.float64),
tensor([3.], dtype=torch.float64),
tensor([4.], dtype=torch.float64)],
'label': tensor([0])}
Expected behavior
I would expect json data to be decoded back into the original form. Is there any way to achieve this behavior?
{'data1': ["a", "b", "c", "d"],
'data2': [1., 2., 3., 4.],
'label': tensor([0])}
Hey @ssharpe42, there are two ways to address this:
Hey @ssharpe42, just wondering if the above worked for you.