Align multiple DataFrames to a Dataset - MOTrainer Documentation

This example notebook reads data from multiple pandas DataFrames and converts it to a single xarray Dataset with aligned time and space axes.

The data used in this notebook is included in the GitHub repository of motrainer and can be found via this link.

Import libraries and set paths¶

In [1]:

Copied!





import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt # only for plots
from pathlib import Path
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt # only for plots
from pathlib import Path

In [2]:

Copied!

# Folder that holds the example pickle files (searched recursively below)
data_dir = "./example_data/dssat_s1/"

# Output locations for the converted dataset
nc_file_path = "./example2_data.nc"  # NetCDF copy
zarr_file_path = "./example2_data.zarr"  # Zarr copy

Read the data and explore it¶

In [3]:

Copied!

# Recursively collect every pickle file found below the data directory
pickle_files_path = [*Path(data_dir).rglob('*.pkl')]
pickle_files_path

Out[3]:

[PosixPath('example_data/dssat_s1/DSSAT/brabant_LAI.pkl'),
 PosixPath('example_data/dssat_s1/DSSAT/brabant_CWAD.pkl'),
 PosixPath('example_data/dssat_s1/DSSAT/brabant_SWTD.pkl'),
 PosixPath('example_data/dssat_s1/DSSAT/brabant_SWTD6.pkl'),
 PosixPath('example_data/dssat_s1/Sentinel-1/Amp_CR_New.pkl')]

In [4]:

Copied!





# Quick look at every pickle file: name, index dtype and table size.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
for file in pickle_files_path:
    df = pd.read_pickle(file)  # read file as pandas dataframe
    print(
        file.stem,  # file name without extension
        df.index.dtype,  # type of the index
        len(df.index),  # number of rows
        len(df.columns),  # number of columns
        "********",  # separator between files
        sep="\n",
    )

brabant_LAI
datetime64[ns]
148
1283
********
brabant_CWAD
datetime64[ns]
148
1283
********
brabant_SWTD
datetime64[ns]
274
1283
********
brabant_SWTD6
datetime64[ns]
274
1283
********
Amp_CR_New
object
60
21927
********

Convert data¶

As seen above, the index type is not consistent among the different files, e.g. "object" and "datetime64[ns]". Below, we convert all indexes to the same type.

In [5]:

Copied!





def _pickle_to_dataset(pickle_path):
    """Read one pickled DataFrame and wrap it in a single-variable Dataset.

    The DataFrame rows become the "time" dimension and its columns the
    "space" dimension. The data variable is named after the file stem.

    Parameters
    ----------
    pickle_path : pathlib.Path
        Path of the pickle file to read. Unpickling can execute arbitrary
        code, so only pass files from a trusted source.

    Returns
    -------
    xarray.Dataset
        Dataset with one data variable of dimensions ("time", "space").
    """
    df = pd.read_pickle(pickle_path)

    # Some files store the index as plain objects; give every file a
    # datetime index so the time coordinates can be compared.
    if not isinstance(df.index, pd.DatetimeIndex):
        df.index = pd.to_datetime(df.index)

    return xr.Dataset(
        {pickle_path.stem: (["time", "space"], df.to_numpy())},
        coords={"time": df.index, "space": df.columns},
    )


# Read the data: one single-variable dataset per pickle file
ds_list = [_pickle_to_dataset(file) for file in pickle_files_path]

# Create one dataset. `xr.merge` aligns the variables on the union of their
# "time" and "space" coordinates (missing entries become NaN).
# `xr.concat(..., dim="time")` would instead stack the time axes of the
# individual files end-to-end, giving repeated timestamps.
dataset = xr.merge(ds_list)

# Add attribute (metadata)
dataset.attrs['source'] = 'data source'
dataset.attrs['license'] = 'data license'

Inspect output and store it¶

In [7]:

Copied!

# Time series of one variable at one location (first entry along "space")
Amp_CR_New = dataset["Amp_CR_New"].isel(space=0)
Amp_CR_New.plot()

Out[7]:

[<matplotlib.lines.Line2D at 0x7f28a87059d0>]

No description has been provided for this image

In [8]:

Copied!

# Write the dataset to disk as a NetCDF file
dataset.to_netcdf(path=nc_file_path)

In [9]:

Copied!

# For large datasets, chunk the data and save it in zarr format.
# `Dataset.chunk` does not modify `dataset`; it returns a new, chunked
# dataset, so that result has to be kept and written out.
# NOTE(review): with the default write mode, `to_zarr` is expected to fail if
# the store already exists; remove the store or pass mode="w" when re-running.
dataset_chunked = dataset.chunk({'space': 1000})
dataset_chunked.to_zarr(zarr_file_path)

Out[9]:

<xarray.backends.zarr.ZarrStore at 0x7f28909af8b0>