from_pandas does not recognize index with timezone
Versions
- Python: 3.7
- pandas: 1.1.5
- koalas: 1.7.0
Code Snippet to Reproduce
import pandas as pd
import databricks.koalas as ks
# create pandas dataframe
index = pd.date_range('2020-01-01', '2020-01-07', tz='UTC')
data = list(range(len(index)))
pdf = pd.DataFrame(data=data, index=index, columns=['foo'])
# convert to koalas dataframe using from_pandas
kdf = ks.from_pandas(pdf)
When the above code snippet is executed, it raises TypeError: Type datetime64[ns, UTC] was not understood.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-6-d768945ae18d> in <module>
1 # convert to koalas dataframe
----> 2 kdf = ks.from_pandas(pdf)
~/opt/miniconda3/lib/python3.7/site-packages/databricks/koalas/namespace.py in from_pandas(pobj)
123 return Series(pobj)
124 elif isinstance(pobj, pd.DataFrame):
--> 125 return DataFrame(pobj)
126 elif isinstance(pobj, pd.Index):
127 return DataFrame(pd.DataFrame(index=pobj)).index
~/opt/miniconda3/lib/python3.7/site-packages/databricks/koalas/frame.py in __init__(self, data, index, columns, dtype, copy)
490 else:
491 pdf = pd.DataFrame(data=data, index=index, columns=columns, dtype=dtype, copy=copy)
--> 492 internal = InternalFrame.from_pandas(pdf)
493
494 object.__setattr__(self, "_internal_frame", internal)
~/opt/miniconda3/lib/python3.7/site-packages/databricks/koalas/internal.py in from_pandas(pdf)
1298 name, infer_pd_series_spark_type(col), nullable=bool(col.isnull().any()),
1299 )
-> 1300 for name, col in reset_index.iteritems()
1301 ]
1302 )
~/opt/miniconda3/lib/python3.7/site-packages/databricks/koalas/internal.py in <listcomp>(.0)
1298 name, infer_pd_series_spark_type(col), nullable=bool(col.isnull().any()),
1299 )
-> 1300 for name, col in reset_index.iteritems()
1301 ]
1302 )
~/opt/miniconda3/lib/python3.7/site-packages/databricks/koalas/typedef/typehints.py in infer_pd_series_spark_type(s)
265 return from_arrow_type(pa.Array.from_pandas(s).type)
266 else:
--> 267 return as_spark_type(dt)
268
269
~/opt/miniconda3/lib/python3.7/site-packages/databricks/koalas/typedef/typehints.py in as_spark_type(tpe, raise_error)
204
205 if raise_error:
--> 206 raise TypeError("Type %s was not understood." % tpe)
207 else:
208 return None
TypeError: Type datetime64[ns, UTC] was not understood.
Unfortunately, Koalas doesn't support a DatetimeIndex with a timezone specified yet.
DatetimeIndex support is a feature that was recently added to handle data types more strictly, and many features are still missing for now.
I am trying to upgrade from 1.5.0 to 1.8.0, and my unit tests are failing because of a "TypeError: Type datetime64[ns, Timezone('UTC')] was not understood." on a non-index column. Here's a quick reproduction: it works on 1.5.0 and fails on 1.8.0 (it also seems to fail on 1.6.0). Should I file this as a bug, or are timezone-aware datetimes no longer supported?
from databricks import koalas
import datetime
import pandas
df = pandas.DataFrame({
"time": [datetime.datetime.now(tz=datetime.timezone.utc)],
})
koalas.from_pandas(df)
# TypeError: Type datetime64[ns, UTC] was not understood.