{ "cells": [ { "cell_type": "markdown", "id": "3185cd90-7614-4ab5-8234-04721abf74e9", "metadata": {}, "source": [ "An example notebook that reads data and convert it to xarray Dataset with aligned time and space axis.\n", "\n", "The data used in this notebook is attached with the GitHub repository of `motrainer`. They can be found via [this link](https://github.com/VegeWaterDynamics/motrainer/tree/main/docs/notebooks/example_data). \n", "\n", "This notebook generates the example dataset `./example1_data.zarr/` for the following example notebooks:\n", "\n", "- [Prallely training sklearn models with dask-ml](https://vegewaterdynamics.github.io/motrainer/notebooks/example_daskml/)\n", "- [Prallely training DNN with Tensorflow](https://vegewaterdynamics.github.io/motrainer/notebooks/example_dnn/)" ] }, { "cell_type": "markdown", "id": "c6db8da8-c6e2-4330-85db-5df185df62cb", "metadata": {}, "source": [ "## Import libraries and set paths" ] }, { "cell_type": "code", "execution_count": 1, "id": "47d44c1e-28a4-4d53-934c-1f32e8989269", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import xarray as xr\n", "import matplotlib.pyplot as plt # only for plots" ] }, { "cell_type": "code", "execution_count": 2, "id": "06e45086-3e05-4d66-a2a4-e22e067eea40", "metadata": {}, "outputs": [], "source": [ "pickle_file_path = \"./example_data/example_data.pickle\"\n", "nc_file_path = \"./example1_data.nc\"\n", "zarr_file_path = \"./example1_data.zarr\"" ] }, { "cell_type": "markdown", "id": "a18a1b54-3f63-4161-9fcf-77e466057225", "metadata": {}, "source": [ "## Read the data and explore it" ] }, { "cell_type": "code", "execution_count": 3, "id": "b37baf27-a457-453b-affe-808798fed525", "metadata": {}, "outputs": [], "source": [ "# Read the data\n", "df_all_gpi = pd.read_pickle(pickle_file_path)" ] }, { "cell_type": "code", "execution_count": 4, "id": "dc298f63-71d7-4ff7-ab0a-d97717191079", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
latlondata
156.12511.375sig slop curv ...
246.1256.625sig slop curv ...
353.3756.125sig slop curv ...
449.37512.375sig slop curv ...
544.3750.625sig slop curv ...
\n", "
" ], "text/plain": [ " lat lon data\n", "1 56.125 11.375 sig slop curv ...\n", "2 46.125 6.625 sig slop curv ...\n", "3 53.375 6.125 sig slop curv ...\n", "4 49.375 12.375 sig slop curv ...\n", "5 44.375 0.625 sig slop curv ..." ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_all_gpi" ] }, { "cell_type": "code", "execution_count": 5, "id": "7631ef10-2ceb-4dd3-b0f7-015ce657898d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sigslopcurvTG1TG2TG3WG1WG2WG3BIOMA1BIOMA2
datetime_doy
2007-01-02-8.774847-0.118061-0.001871282.495667277.571790280.4320190.3531690.2979540.3169280.0557790.064610
2007-01-03-8.737255-0.116761-0.001753283.059404278.609833279.8516780.2244770.3362830.3031210.0571880.007182
2007-01-03-8.791911-0.118357-0.002037284.386143278.075722285.3831570.3786450.2503490.3357150.0622800.043909
2007-01-05-7.962205-0.118063-0.002072276.947048277.841682277.9413200.3059450.3322800.3156070.0528770.017596
2007-01-06-8.607216-0.118727-0.002048276.458553282.783491277.9569620.3804800.3646970.2805300.0513090.034444
....................................
2019-12-30-8.824627-0.119621-0.000872NaNNaNNaNNaNNaNNaNNaNNaN
2019-12-31-8.578708-0.121446-0.001059NaNNaNNaNNaNNaNNaNNaNNaN
2019-12-31-8.731547-0.119538-0.000887NaNNaNNaNNaNNaNNaNNaNNaN
2020-01-01-7.358630-0.122284-0.000725NaNNaNNaNNaNNaNNaNNaNNaN
2020-01-01-9.165778-0.123732-0.000753NaNNaNNaNNaNNaNNaNNaNNaN
\n", "

7995 rows × 11 columns

\n", "
" ], "text/plain": [ " sig slop curv TG1 TG2 \\\n", "datetime_doy \n", "2007-01-02 -8.774847 -0.118061 -0.001871 282.495667 277.571790 \n", "2007-01-03 -8.737255 -0.116761 -0.001753 283.059404 278.609833 \n", "2007-01-03 -8.791911 -0.118357 -0.002037 284.386143 278.075722 \n", "2007-01-05 -7.962205 -0.118063 -0.002072 276.947048 277.841682 \n", "2007-01-06 -8.607216 -0.118727 -0.002048 276.458553 282.783491 \n", "... ... ... ... ... ... \n", "2019-12-30 -8.824627 -0.119621 -0.000872 NaN NaN \n", "2019-12-31 -8.578708 -0.121446 -0.001059 NaN NaN \n", "2019-12-31 -8.731547 -0.119538 -0.000887 NaN NaN \n", "2020-01-01 -7.358630 -0.122284 -0.000725 NaN NaN \n", "2020-01-01 -9.165778 -0.123732 -0.000753 NaN NaN \n", "\n", " TG3 WG1 WG2 WG3 BIOMA1 BIOMA2 \n", "datetime_doy \n", "2007-01-02 280.432019 0.353169 0.297954 0.316928 0.055779 0.064610 \n", "2007-01-03 279.851678 0.224477 0.336283 0.303121 0.057188 0.007182 \n", "2007-01-03 285.383157 0.378645 0.250349 0.335715 0.062280 0.043909 \n", "2007-01-05 277.941320 0.305945 0.332280 0.315607 0.052877 0.017596 \n", "2007-01-06 277.956962 0.380480 0.364697 0.280530 0.051309 0.034444 \n", "... ... ... ... ... ... ... \n", "2019-12-30 NaN NaN NaN NaN NaN NaN \n", "2019-12-31 NaN NaN NaN NaN NaN NaN \n", "2019-12-31 NaN NaN NaN NaN NaN NaN \n", "2020-01-01 NaN NaN NaN NaN NaN NaN \n", "2020-01-01 NaN NaN NaN NaN NaN NaN \n", "\n", "[7995 rows x 11 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_all_gpi.iloc[3][\"data\"]" ] }, { "cell_type": "markdown", "id": "f9dffc7e-489a-4588-bab1-456ed0fb2db3", "metadata": {}, "source": [ "## Convert data\n", "\n", "As seen above, the \"datetime_doy\" values are not unique. While it's possible to have non-unique index values, it's generally not recommended. Having a non-unique index can make certain operations and data manipulation more complex, or even incorrect. These values shows two observation at one day. To avoid duplication, we add a hal-hour shift." ] }, { "cell_type": "code", "execution_count": 6, "id": "1ec3de51-f368-4d17-a307-aac3f39ac1c0", "metadata": {}, "outputs": [], "source": [ "# Function to make timestamps unique by adding half an hour\n", "def make_timestamps_unique(df):\n", " seen_timestamps = set()\n", " new_index = []\n", "\n", " for timestamp in df.index:\n", " if timestamp not in seen_timestamps:\n", " new_index.append(timestamp)\n", " seen_timestamps.add(timestamp)\n", " else:\n", " # Timestamp is a duplicate, add half an hour\n", " while timestamp in seen_timestamps:\n", " timestamp += pd.Timedelta(minutes=30)\n", " new_index.append(timestamp)\n", " seen_timestamps.add(timestamp)\n", " \n", " df.index = new_index\n", " df.index.name = \"time\"\n", " return df" ] }, { "cell_type": "code", "execution_count": 7, "id": "742333d4-9796-4248-ab19-c06c854d8bdb", "metadata": {}, "outputs": [], "source": [ "ds_list = []\n", "for index, row in df_all_gpi.iterrows():\n", " \n", " # Filter the nested DataFrame based on location\n", " df = df_all_gpi.iloc[index-1][\"data\"]\n", "\n", " # Make timestamps unique\n", " df = make_timestamps_unique(df)\n", "\n", " # convert dataframe to dataset\n", " ds = xr.Dataset(df, coords={'latitude': row[\"lat\"], 'longitude': row[\"lon\"]})\n", " ds_list.append(ds)\n", "\n", "# Create one dataset\n", "dataset = xr.concat(ds_list, dim=\"space\")\n", "\n", "# Add attribute (metadata)\n", "dataset.attrs['source'] = 'data source'\n", "dataset.attrs['license'] = 'data license'" ] }, { "cell_type": "markdown", "id": "43bfa92c-aafc-48c4-970d-88979cb1c29d", "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true } }, "source": [ "## Inspect output and store it" ] }, { "cell_type": "code", "execution_count": 8, "id": "d5358c5c-3fc1-40e4-8d4a-ac52a7d4e242", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.Dataset>\n",
       "Dimensions:    (time: 8506, space: 5)\n",
       "Coordinates:\n",
       "  * time       (time) datetime64[ns] 2007-01-02 ... 2020-01-01T01:00:00\n",
       "    latitude   (space) float64 56.12 46.12 53.38 49.38 44.38\n",
       "    longitude  (space) float64 11.38 6.625 6.125 12.38 0.625\n",
       "Dimensions without coordinates: space\n",
       "Data variables:\n",
       "    sig        (space, time) float64 -9.49 -8.494 -9.069 ... -8.071 -8.237\n",
       "    slop       (space, time) float64 -0.1208 -0.1178 -0.121 ... -0.1144 -0.1191\n",
       "    curv       (space, time) float64 -0.001396 -0.001464 ... -0.0006173\n",
       "    TG1        (space, time) float64 280.0 270.4 285.5 277.4 ... nan nan nan nan\n",
       "    TG2        (space, time) float64 274.8 278.4 280.6 283.7 ... nan nan nan nan\n",
       "    TG3        (space, time) float64 280.9 279.7 278.0 278.0 ... nan nan nan nan\n",
       "    WG1        (space, time) float64 0.3249 0.2798 0.2773 0.2867 ... nan nan nan\n",
       "    WG2        (space, time) float64 0.3408 0.2902 0.3373 0.2709 ... nan nan nan\n",
       "    WG3        (space, time) float64 0.3123 0.2916 0.2891 0.3538 ... nan nan nan\n",
       "    BIOMA1     (space, time) float64 0.07079 0.05532 0.04846 ... nan nan nan\n",
       "    BIOMA2     (space, time) float64 0.04366 0.0462 0.03821 ... nan nan nan\n",
       "Attributes:\n",
       "    source:   data source\n",
       "    license:  data license
" ], "text/plain": [ "\n", "Dimensions: (time: 8506, space: 5)\n", "Coordinates:\n", " * time (time) datetime64[ns] 2007-01-02 ... 2020-01-01T01:00:00\n", " latitude (space) float64 56.12 46.12 53.38 49.38 44.38\n", " longitude (space) float64 11.38 6.625 6.125 12.38 0.625\n", "Dimensions without coordinates: space\n", "Data variables:\n", " sig (space, time) float64 -9.49 -8.494 -9.069 ... -8.071 -8.237\n", " slop (space, time) float64 -0.1208 -0.1178 -0.121 ... -0.1144 -0.1191\n", " curv (space, time) float64 -0.001396 -0.001464 ... -0.0006173\n", " TG1 (space, time) float64 280.0 270.4 285.5 277.4 ... nan nan nan nan\n", " TG2 (space, time) float64 274.8 278.4 280.6 283.7 ... nan nan nan nan\n", " TG3 (space, time) float64 280.9 279.7 278.0 278.0 ... nan nan nan nan\n", " WG1 (space, time) float64 0.3249 0.2798 0.2773 0.2867 ... nan nan nan\n", " WG2 (space, time) float64 0.3408 0.2902 0.3373 0.2709 ... nan nan nan\n", " WG3 (space, time) float64 0.3123 0.2916 0.2891 0.3538 ... nan nan nan\n", " BIOMA1 (space, time) float64 0.07079 0.05532 0.04846 ... nan nan nan\n", " BIOMA2 (space, time) float64 0.04366 0.0462 0.03821 ... nan nan nan\n", "Attributes:\n", " source: data source\n", " license: data license" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset" ] }, { "cell_type": "code", "execution_count": 9, "id": "5698dee8-a668-44c5-800c-9357858e36a3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Time series of one variable at one location\n", "sig = dataset.sig.isel(space=0)\n", "sig.plot()" ] }, { "cell_type": "code", "execution_count": 10, "id": "560a1bc8-2095-4186-aa43-c822d37cd22d", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# map of one variable at one time\n", "time = \"2007-01-02T00:00:00.000000000\"\n", "sig = dataset.sig.sel(time=time)\n", "lons, lats = xr.broadcast(sig.longitude, sig.latitude)\n", "plt.figure(figsize=(8, 6))\n", "plt.scatter(lons, lats, c=sig, cmap=\"viridis\", marker=\"o\", s=10)\n", "plt.colorbar(label=\"sig\") \n", "plt.xlabel(\"longitude\")\n", "plt.ylabel(\"latitude\")\n", "plt.title(f\"Plot of sig at time {time}\")\n", "plt.grid(True)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 11, "id": "b7b7dac4-f981-4627-9f56-d5c297f2e769", "metadata": {}, "outputs": [], "source": [ "# Save data in netcdf format\n", "dataset.to_netcdf(nc_file_path)" ] }, { "cell_type": "code", "execution_count": 12, "id": "dbaed4c1-61dc-4d07-8094-740bac0e8e07", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# For large dataset, chunk the data and save data in zarr format\n", "dataset.chunk({'space':1000})\n", "dataset.to_zarr(zarr_file_path)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }