{ "cells": [ { "cell_type": "markdown", "id": "3185cd90-7614-4ab5-8234-04721abf74e9", "metadata": {}, "source": [ "An example notebook that reads data and converts it to an xarray Dataset with aligned time and space axes.\n", "\n", "The data used in this notebook is included in the GitHub repository of `motrainer`. It can be found via [this link](https://github.com/VegeWaterDynamics/motrainer/tree/main/docs/notebooks/example_data). \n", "\n", "This notebook generates the example dataset `./example1_data.zarr/` for the following example notebooks:\n", "\n", "- [Parallelly training sklearn models with dask-ml](https://vegewaterdynamics.github.io/motrainer/notebooks/example_daskml/)\n", "- [Parallelly training DNN with TensorFlow](https://vegewaterdynamics.github.io/motrainer/notebooks/example_dnn/)" ] }, { "cell_type": "markdown", "id": "c6db8da8-c6e2-4330-85db-5df185df62cb", "metadata": {}, "source": [ "## Import libraries and set paths" ] }, { "cell_type": "code", "execution_count": 1, "id": "47d44c1e-28a4-4d53-934c-1f32e8989269", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import xarray as xr\n", "import matplotlib.pyplot as plt # only for plots" ] }, { "cell_type": "code", "execution_count": 2, "id": "06e45086-3e05-4d66-a2a4-e22e067eea40", "metadata": {}, "outputs": [], "source": [ "pickle_file_path = \"./example_data/example_data.pickle\"\n", "nc_file_path = \"./example1_data.nc\"\n", "zarr_file_path = \"./example1_data.zarr\"" ] }, { "cell_type": "markdown", "id": "a18a1b54-3f63-4161-9fcf-77e466057225", "metadata": {}, "source": [ "## Read the data and explore it" ] }, { "cell_type": "code", "execution_count": 3, "id": "b37baf27-a457-453b-affe-808798fed525", "metadata": {}, "outputs": [], "source": [ "# Read the data\n", "df_all_gpi = pd.read_pickle(pickle_file_path)" ] }, { "cell_type": "code", "execution_count": 4, "id": "dc298f63-71d7-4ff7-ab0a-d97717191079", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | lat | \n", "lon | \n", "data | \n", "
---|---|---|---|
1 | \n", "56.125 | \n", "11.375 | \n", "sig slop curv ... | \n", "
2 | \n", "46.125 | \n", "6.625 | \n", "sig slop curv ... | \n", "
3 | \n", "53.375 | \n", "6.125 | \n", "sig slop curv ... | \n", "
4 | \n", "49.375 | \n", "12.375 | \n", "sig slop curv ... | \n", "
5 | \n", "44.375 | \n", "0.625 | \n", "sig slop curv ... | \n", "
\n", " | sig | \n", "slop | \n", "curv | \n", "TG1 | \n", "TG2 | \n", "TG3 | \n", "WG1 | \n", "WG2 | \n", "WG3 | \n", "BIOMA1 | \n", "BIOMA2 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|
datetime_doy | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
2007-01-02 | \n", "-8.774847 | \n", "-0.118061 | \n", "-0.001871 | \n", "282.495667 | \n", "277.571790 | \n", "280.432019 | \n", "0.353169 | \n", "0.297954 | \n", "0.316928 | \n", "0.055779 | \n", "0.064610 | \n", "
2007-01-03 | \n", "-8.737255 | \n", "-0.116761 | \n", "-0.001753 | \n", "283.059404 | \n", "278.609833 | \n", "279.851678 | \n", "0.224477 | \n", "0.336283 | \n", "0.303121 | \n", "0.057188 | \n", "0.007182 | \n", "
2007-01-03 | \n", "-8.791911 | \n", "-0.118357 | \n", "-0.002037 | \n", "284.386143 | \n", "278.075722 | \n", "285.383157 | \n", "0.378645 | \n", "0.250349 | \n", "0.335715 | \n", "0.062280 | \n", "0.043909 | \n", "
2007-01-05 | \n", "-7.962205 | \n", "-0.118063 | \n", "-0.002072 | \n", "276.947048 | \n", "277.841682 | \n", "277.941320 | \n", "0.305945 | \n", "0.332280 | \n", "0.315607 | \n", "0.052877 | \n", "0.017596 | \n", "
2007-01-06 | \n", "-8.607216 | \n", "-0.118727 | \n", "-0.002048 | \n", "276.458553 | \n", "282.783491 | \n", "277.956962 | \n", "0.380480 | \n", "0.364697 | \n", "0.280530 | \n", "0.051309 | \n", "0.034444 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
2019-12-30 | \n", "-8.824627 | \n", "-0.119621 | \n", "-0.000872 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
2019-12-31 | \n", "-8.578708 | \n", "-0.121446 | \n", "-0.001059 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
2019-12-31 | \n", "-8.731547 | \n", "-0.119538 | \n", "-0.000887 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
2020-01-01 | \n", "-7.358630 | \n", "-0.122284 | \n", "-0.000725 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
2020-01-01 | \n", "-9.165778 | \n", "-0.123732 | \n", "-0.000753 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
7995 rows × 11 columns
\n", "<xarray.Dataset>\n", "Dimensions: (time: 8506, space: 5)\n", "Coordinates:\n", " * time (time) datetime64[ns] 2007-01-02 ... 2020-01-01T01:00:00\n", " latitude (space) float64 56.12 46.12 53.38 49.38 44.38\n", " longitude (space) float64 11.38 6.625 6.125 12.38 0.625\n", "Dimensions without coordinates: space\n", "Data variables:\n", " sig (space, time) float64 -9.49 -8.494 -9.069 ... -8.071 -8.237\n", " slop (space, time) float64 -0.1208 -0.1178 -0.121 ... -0.1144 -0.1191\n", " curv (space, time) float64 -0.001396 -0.001464 ... -0.0006173\n", " TG1 (space, time) float64 280.0 270.4 285.5 277.4 ... nan nan nan nan\n", " TG2 (space, time) float64 274.8 278.4 280.6 283.7 ... nan nan nan nan\n", " TG3 (space, time) float64 280.9 279.7 278.0 278.0 ... nan nan nan nan\n", " WG1 (space, time) float64 0.3249 0.2798 0.2773 0.2867 ... nan nan nan\n", " WG2 (space, time) float64 0.3408 0.2902 0.3373 0.2709 ... nan nan nan\n", " WG3 (space, time) float64 0.3123 0.2916 0.2891 0.3538 ... nan nan nan\n", " BIOMA1 (space, time) float64 0.07079 0.05532 0.04846 ... nan nan nan\n", " BIOMA2 (space, time) float64 0.04366 0.0462 0.03821 ... nan nan nan\n", "Attributes:\n", " source: data source\n", " license: data license