{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Import cell\n",
    "import os \n",
    "import os.path as pt\n",
    "from os import path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import math\n",
    "\n",
    "import array\n",
    "import random\n",
    "from datetime import datetime, timedelta\n",
    "import shutil"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Station</th>\n",
       "      <th>Date</th>\n",
       "      <th>Measurement</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>TA00096</td>\n",
       "      <td>2020-01-01 03:00:00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>TA00096</td>\n",
       "      <td>2020-01-01 06:00:00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>TA00096</td>\n",
       "      <td>2020-01-01 09:00:00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>TA00096</td>\n",
       "      <td>2020-01-01 12:00:00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>TA00096</td>\n",
       "      <td>2020-01-01 15:00:00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111221</th>\n",
       "      <td>111221</td>\n",
       "      <td>TA00687</td>\n",
       "      <td>2020-12-31 09:00:00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111222</th>\n",
       "      <td>111222</td>\n",
       "      <td>TA00687</td>\n",
       "      <td>2020-12-31 12:00:00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111223</th>\n",
       "      <td>111223</td>\n",
       "      <td>TA00687</td>\n",
       "      <td>2020-12-31 15:00:00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111224</th>\n",
       "      <td>111224</td>\n",
       "      <td>TA00687</td>\n",
       "      <td>2020-12-31 18:00:00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111225</th>\n",
       "      <td>111225</td>\n",
       "      <td>TA00687</td>\n",
       "      <td>2020-12-31 21:00:00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>111226 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Unnamed: 0  Station                 Date  Measurement\n",
       "0                0  TA00096  2020-01-01 03:00:00            0\n",
       "1                1  TA00096  2020-01-01 06:00:00            0\n",
       "2                2  TA00096  2020-01-01 09:00:00            0\n",
       "3                3  TA00096  2020-01-01 12:00:00            0\n",
       "4                4  TA00096  2020-01-01 15:00:00            0\n",
       "...            ...      ...                  ...          ...\n",
       "111221      111221  TA00687  2020-12-31 09:00:00            0\n",
       "111222      111222  TA00687  2020-12-31 12:00:00            0\n",
       "111223      111223  TA00687  2020-12-31 15:00:00            0\n",
       "111224      111224  TA00687  2020-12-31 18:00:00            0\n",
       "111225      111225  TA00687  2020-12-31 21:00:00            0\n",
       "\n",
       "[111226 rows x 4 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data2019 = pd.read_csv('./Output_files/samples_2020.csv')\n",
    "data2019"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Station</th>\n",
       "      <th>Date</th>\n",
       "      <th>Measurement</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>76097</th>\n",
       "      <td>76097</td>\n",
       "      <td>TA00457</td>\n",
       "      <td>2020-12-31 09:00:00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>76098</th>\n",
       "      <td>76098</td>\n",
       "      <td>TA00457</td>\n",
       "      <td>2020-12-31 12:00:00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>76099</th>\n",
       "      <td>76099</td>\n",
       "      <td>TA00457</td>\n",
       "      <td>2020-12-31 15:00:00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>76100</th>\n",
       "      <td>76100</td>\n",
       "      <td>TA00457</td>\n",
       "      <td>2020-12-31 18:00:00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>76101</th>\n",
       "      <td>76101</td>\n",
       "      <td>TA00457</td>\n",
       "      <td>2020-12-31 21:00:00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       Unnamed: 0  Station                 Date  Measurement\n",
       "76097       76097  TA00457  2020-12-31 09:00:00            0\n",
       "76098       76098  TA00457  2020-12-31 12:00:00            0\n",
       "76099       76099  TA00457  2020-12-31 15:00:00            0\n",
       "76100       76100  TA00457  2020-12-31 18:00:00            0\n",
       "76101       76101  TA00457  2020-12-31 21:00:00            0"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data2019.loc[(data2019['Station']=='TA00457')].tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from multiprocessing import cpu_count\n",
    "from multiprocessing.pool import ThreadPool\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "def sequence_date(iterable):\n",
    "    date, output_parent_dir, msg_parent_dir = iterable\n",
    "\n",
    "    data = pd.read_csv('./Output_files/samples_' + str(date.year) + '.csv', parse_dates=['Date'])\n",
    "\n",
    "    date_start = pd.Timestamp(date) # Starting at the time of the image\n",
    "    date_end = pd.Timestamp(date) + timedelta(hours=3) # Finishing2h 45 min later\n",
    "    \n",
    "    #1. Extract all stations that took data in a day and create folders\n",
    "    len_sts_unique = len(data.loc[data['Date']==date, 'Station'].unique())\n",
    "\n",
    "    print('DATE:', date)\n",
    "    print('stations:', data.loc[data['Date']==date, 'Station'].unique())\n",
    "    \n",
    "    for ix, st in enumerate(data.loc[data['Date']==date, 'Station'].unique()):\n",
    "        name_folder =  st + '_' + str(date_start.strftime('%Y.%m.%d_%H'))\n",
    "        \n",
    "        # MAKING THE FOLDERS TO STORE THE IMAGES\n",
    "        # if (pt.exists(output_parent_dir + name_folder) == 0): os.mkdir(output_parent_dir + name_folder)\n",
    "        \n",
    "        # POPULATE EACH FOLDER WITH THE RESPECTIVE FILE\n",
    "   \n",
    "        # # Timestamps of all 15 min files\n",
    "        # for minu in np.arange(12):\n",
    "        #     timestamp = date_start + timedelta(minutes=int(minu*15))\n",
    "    \n",
    "        #     # Convert that datetime to the format in the filenames of the MSG  images\n",
    "        #     str_search = str(timestamp.strftime('%Y%m%d_%H'))\n",
    "            \n",
    "        #     dir_st = msg_parent_dir+st\n",
    "        #     print('dir_st1:', dir_st)\n",
    "        #     directory = os.listdir(dir_st)\n",
    "        #     print('dir_st2:', dir_st)\n",
    "        #     ## Look for the files that have that string and keep them in a folder (different per hour)\n",
    "        #     for fname in directory:\n",
    "        #         if str_search in fname:\n",
    "        #             print('fname', fname)\n",
    "        #             #Copy the file to an output directory\n",
    "        #             shutil.copy(dir_st+'/'+fname, output_parent_dir + name_folder)\n",
    "\n",
    "\n",
    "def sequences_parallel(iterables):\n",
    "    print('Running in parallel')\n",
    "    cpus = cpu_count()\n",
    "    results = ThreadPool(cpus - 1).imap_unordered(sequence_date, iterables, chunksize=100)\n",
    "\n",
    "def sequences_create(year, channel):\n",
    "    data = pd.read_csv('./Output_files/samples_' + str(year) + '.csv', parse_dates=['Date'])\n",
    "\n",
    "    # Directory where the MSG images are (from here, they are divided in subfolders per station)\n",
    "    output_parent_dir = r'Q:/InputData/SAVANNA-MSG-32pix-' +str(year) + '-' + channel + '/'\n",
    "    if (pt.exists(output_parent_dir) == 0): os.mkdir(output_parent_dir)\n",
    "    msg_parent_dir = r'Q:/' + str(year) + '-MSG-32pix-' + channel + '/'\n",
    "\n",
    "    iterables = pd.DataFrame()\n",
    "    iterables['dates'] = data['Date'].unique()\n",
    "    iterables['output_parent_dir'] = [output_parent_dir] * len(iterables)\n",
    "    iterables['msg_parent_dir'] = [msg_parent_dir] * len(iterables)\n",
    "    \n",
    "    sequences_parallel(iterables.to_numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "YEAR:  2020\n",
      "CHANNEL: WV_073\n",
      "\t FINISHED CHANNEL WV_073\n",
      " -------------------------\n",
      "\t FINISHED YEAR 2020\n",
      " -------------------------\n"
     ]
    }
   ],
   "source": [
    "for year in [2020]:\n",
    "    print('YEAR: ', year)\n",
    "    for channel in ['WV_073']:\n",
    "        print('CHANNEL:', channel)\n",
    "        \n",
    "        data = pd.read_csv('./Output_files/samples_' + str(year) + '.csv', parse_dates=['Date'])\n",
    "        msg_parent_dir = r'E:/InputData/' + str(year) + '-MSG-32pix-' + channel + '/'\n",
    "        output_parent_dir = r'E:/InputData/SAVANNA-MSG-32pix-' + str(year) + '-' + channel + '/'\n",
    "        \n",
    "        if (pt.exists(output_parent_dir) == 0): os.mkdir(output_parent_dir)\n",
    "        \n",
    "        for ix, row in data.iterrows():\n",
    "            if ix>76100:\n",
    "            \n",
    "                st = row.Station\n",
    "                date_start = row.Date\n",
    "\n",
    "                # The folder from which we will take the images\n",
    "                source_dir = msg_parent_dir + st \n",
    "                if(pt.exists(source_dir)):\n",
    "                    # Folder where the images will be stored - one per row (identified with sequence and station). \n",
    "                    name_folder =  st + '_' + str(date_start.strftime('%Y.%m.%d_%H'))\n",
    "                    target_dir = output_parent_dir + name_folder \n",
    "                    if (pt.exists(target_dir) == 0): \n",
    "                        #if the sequence has not been processed yet, i.e. there is no folder, make the folder and take the files\n",
    "                        os.mkdir(target_dir)\n",
    "\n",
    "                        # Locate all the files in the source directory and copy them to the target directory\n",
    "\n",
    "                        for hrs in np.arange(3):\n",
    "                            timestamp = date_start + timedelta(hours=int(hrs))\n",
    "                            # Convert that datetime to the format in the filenames of the MSG  images\n",
    "                            str_search = str(timestamp.strftime('%Y%m%d_%H'))\n",
    "\n",
    "                            ## Look for the files that have that string and keep them in a folder (different per hour)\n",
    "                            for fname in os.listdir(source_dir):\n",
    "                                if str_search in fname:\n",
    "                                    #Copy the file to an output directory\n",
    "                                    shutil.copy(source_dir+'/'+fname, target_dir)\n",
    "                        \n",
    "        print(f'\\t FINISHED CHANNEL {channel}\\n -------------------------')\n",
    "        \n",
    "    print(f'\\t FINISHED YEAR {year}\\n -------------------------')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}