"""
File: model.py
Description: This script processes the raw data from the paper "A zone-based Wi-Fi fingerprinting 
             indoor positioning system for factory noise mapping" and evaluates machine learning 
             model performance as discussed in Section 4.3 ("ML Model Performance Evaluation"). 
             It analyzes model accuracy and effectiveness for indoor positioning and noise mapping 
             in a factory setting.

Author: L. Xiao (l.xiao@utwente.nl)
Date: March 7, 2025

License: This project is licensed under the MIT License - see the LICENSE file for details.

Copyright (c) 2025 L. Xiao
"""

import os
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import tensorflow as tf
import seaborn as sns
import time

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold-out fraction used by the single train/test split helpers.
# It equals one fold of a 5-fold cross-validation (1/5 = 0.2).
SPLIT_TEST_DATASET_PECENTAGE = 0.2

# Pickle files of the trained scikit-learn classifiers.
PATH_MODEL_KNN          = "./FingerPrintWifi/result_rssi/knn.pkl"
PATH_MODEL_GAUSSIAN_NB  = "./FingerPrintWifi/result_rssi/gaussian_nb.pkl"
PATH_MODEL_DT           = "./FingerPrintWifi/result_rssi/dt.pkl"
# NOTE(review): MLModel.init_model has no branch that loads this path.
PATH_MODEL_MLP          = "./FingerPrintWifi/result_rssi/mlp.pkl"
PATH_MODEL_RF          = "./FingerPrintWifi/result_rssi/rf.pkl"
# NOTE(review): the file name is "svg.pkl", presumably a typo for "svm.pkl";
# left unchanged so that previously saved models still load.
PATH_MODEL_SVM          = "./FingerPrintWifi/result_rssi/svg.pkl"

# Keras network (HDF5) and the pickled StandardScaler shared by all models.
PATH_MODEL_NN           = "./FingerPrintWifi/result_rssi/nn_rssi.h5"
PATH_SCALER             = './FingerPrintWifi/result_rssi/scaler.pkl'
# Column names applied to loaded data: six access-point RSSI columns, then the zone label.
DATA_HEADER_ALL     = ["AP1", "AP2", "AP3", "AP4", "AP5", "AP6", "label"]
DATA_HEADER_RSSI    = ["AP1", "AP2", "AP3", "AP4", "AP5", "AP6"]

# Decimal places used when rounding accuracies/training times (2) and
# per-sample prediction times (6).
ROUND_DIGIT_2 = 2
ROUND_DIGIT_6 = 6
# CSV outputs written by Test_zonesize, Test_samplenum, Test_apnum and Test_re_rssi.
PATH_ZONE_SIZE = './FingerPrintWifi/result_rssi/model_performance_zonesize.csv'
PATH_SAMPLING_NUM = './FingerPrintWifi/result_rssi/model_performance_samples_num.csv'
PATH_AP_NUM = './FingerPrintWifi/result_rssi/model_performance_ap_num.csv'
PATH_RSSI_TYPE = './FingerPrintWifi/result_rssi/model_performance_rssi_type.csv'

# CSV output written by accuracy_compare.
PATH_NEIGHBOR_ZONE = './FingerPrintWifi/result_rssi/neighbor_zone_influ.csv'
# Column order of every result table; matches the order in which
# Test_modeltrain evaluates the classifiers.
MODEL_NAMES = ['KNN', 'MLP', 'DecisionTree', 'GaussianNB', 'RandomForest', 'SVM']


class MLModel:
    """
    Train, evaluate, persist and query zone classifiers on Wi-Fi RSSI data.

    The instance keeps the most recently trained or loaded estimator in
    ``_model`` and the ``StandardScaler`` that belongs to it in ``_scaler``.
    Every ``run_*`` call of a scikit-learn classifier appends its training
    time and per-sample prediction time (both in milliseconds) to
    ``traintimes`` and ``predtimes``.
    """

    def __init__(self) -> None:
        # Most recently trained or loaded estimator.
        self._model = None
        self.modeltype = "Unknown"
        # Scaler fitted on the training data; the same one must be used to predict.
        self._scaler = None
        # Folder for saved models and CSV results.
        self.workpath = "./result_rssi"
        self.accuracy_list = []
        self.traintimes = []
        self.predtimes = []
        # When True, accuracy_score also accepts predictions listed in neighbor_zones.
        self.allow_neighbor = False
        self.neighbor_zones = {}

    def init_model(self, model_type):
        """
        Load a previously saved model (and the shared scaler) by short name.

        @param model_type one of "KNN", "DT", "GNB", "RF", "SVM", "MLP", "NN";
               any other value only prints a message and loads nothing.
        """
        self.modeltype = model_type
        pickled_models = {
            "KNN": PATH_MODEL_KNN,
            "DT": PATH_MODEL_DT,
            "GNB": PATH_MODEL_GAUSSIAN_NB,
            "RF": PATH_MODEL_RF,
            "SVM": PATH_MODEL_SVM,
            "MLP": PATH_MODEL_MLP,
        }
        if model_type in pickled_models:
            self.load_model(pickled_models[model_type])
        elif model_type == "NN":
            # The Keras model is stored as HDF5, not as a pickle, so it needs
            # the Keras loader; the scaler is still the shared pickle.
            self.load_NN(PATH_MODEL_NN)
            self._load_scaler()
        else:
            print(f"Not supported type: {model_type}")

    def load_data(self, folder_path, save_sum=False):
        """
        Read every "<zone number>.csv" file in a folder into one DataFrame.

        Files whose base name is not purely numeric (including
        combined_data.csv) are skipped. The file name becomes the label of
        all rows of that file.

        @param folder_path folder containing the per-zone CSV files (no header row)
        @param save_sum also write the combined table to combined_data.csv
               in the same folder
        @return DataFrame with the columns of DATA_HEADER_ALL
        @raise ValueError if the folder contains no usable CSV file
        """
        df_list = []

        for filename in os.listdir(folder_path):
            if not filename.endswith(".csv") or filename == "combined_data.csv":
                continue
            label = os.path.splitext(filename)[0]
            if not label.isdigit():
                continue

            df = pd.read_csv(os.path.join(folder_path, filename), header=None)
            df["label"] = [int(label)] * len(df)
            df_list.append(df)

        if not df_list:
            raise ValueError(f"No '<zone number>.csv' files found in {folder_path}")

        df_sum = pd.concat(df_list, ignore_index=True)
        df_sum.columns = DATA_HEADER_ALL

        if save_sum:
            save_path = os.path.join(folder_path, "combined_data.csv")
            df_sum.to_csv(save_path, index=False)
        print(f"Combined data total: {len(df_sum)} ")

        return df_sum

    def load_data2(self, csv_path):
        """
        Load one combined CSV file (with a header row) that holds all samples.

        @return DataFrame whose columns are renamed to DATA_HEADER_ALL
        """
        df = pd.read_csv(csv_path)
        df.columns = DATA_HEADER_ALL
        return df

    def split(self,  df: pd.DataFrame):
        """
        Split into train/test parts without scaling.

        @return X_train, X_test, y_train, y_test
        """
        X = df.drop(columns=["label"])
        y = df["label"]
        return train_test_split(X, y, test_size=SPLIT_TEST_DATASET_PECENTAGE, random_state=42)

    def split_scale(self,  df: pd.DataFrame):
        """
        Split into train/test parts and standardize the features.

        The scaler is fitted on the training part only and stored in
        ``self._scaler``.

        @return X_train_scaled, X_test_scaled, y_train, y_test
        """
        X = df.drop(columns=["label"])
        y = df["label"]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=SPLIT_TEST_DATASET_PECENTAGE, random_state=42)

        self._scaler = StandardScaler()
        X_train_scaled = self._scaler.fit_transform(X_train)
        X_test_scaled = self._scaler.transform(X_test)

        return X_train_scaled, X_test_scaled, y_train, y_test

    def split_scale_nn(self,  df: pd.DataFrame, num_classes=40):
        """
        Same as split_scale, but the labels are one-hot encoded for Keras.

        @param num_classes width of the one-hot encoding (default 40, the
               value that used to be hard-coded)
        @return X_train_scaled, X_test_scaled, y_train, y_test (y one-hot)
        """
        X = df.drop(columns=["label"])
        y_encoded = tf.keras.utils.to_categorical(df["label"], num_classes=num_classes)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y_encoded, test_size=SPLIT_TEST_DATASET_PECENTAGE, random_state=42)
        self._scaler = StandardScaler()
        X_train_scaled = self._scaler.fit_transform(X_train)
        X_test_scaled = self._scaler.transform(X_test)

        return X_train_scaled, X_test_scaled, y_train, y_test

    def save_model(self, filepath):
        """
        Pickle the current model to filepath and the scaler to PATH_SCALER.

        Not for TensorFlow models; run_NN saves those itself.
        """
        print(f"current model is saved to {filepath}")
        for target in (filepath, PATH_SCALER):
            folder_path = os.path.dirname(target)
            # dirname is "" for a bare file name; makedirs("") would raise.
            if folder_path:
                os.makedirs(folder_path, exist_ok=True)

        with open(filepath, 'wb') as f:
            pickle.dump(self._model, f)
        with open(PATH_SCALER, 'wb') as f:
            pickle.dump(self._scaler, f)

    def _load_scaler(self):
        """Restore the scaler that save_model wrote to PATH_SCALER."""
        with open(PATH_SCALER, 'rb') as f:
            self._scaler = pickle.load(f)

    def load_model(self, filepath):
        """
        Load both a pickled model and the shared scaler.

        NOTE: pickle.load executes code embedded in the file; only load
        model files that were produced by this project.
        """
        print(f"load a local model at: {filepath}")
        with open(filepath, 'rb') as file:
            self._model = pickle.load(file)
        self._load_scaler()

    def load_NN(self, model_path):
        """Load a saved Keras model; the scaler is not touched."""
        self._model = tf.keras.models.load_model(model_path)

    def _fit_and_score(self, estimator, name, X_train, X_test, y_train, y_test):
        """
        Shared body of the scikit-learn run_* methods.

        Fits the estimator, predicts the test set, records the training time
        and the per-sample prediction time (milliseconds), keeps the
        estimator as the current model and prints a one-line summary.

        @param name label used at the start of the printed summary
        @return accuracy rounded to ROUND_DIGIT_2 decimals
        """
        start_time = time.perf_counter()
        estimator.fit(X_train, y_train)
        training_time = (time.perf_counter() - start_time) * 1000

        start_time = time.perf_counter()
        y_pred = estimator.predict(X_test)
        pred_time = (time.perf_counter() - start_time) * 1000

        accuracy = self.accuracy_score(y_test, y_pred)
        self._model = estimator
        self.traintimes.append(round(training_time, ROUND_DIGIT_2))
        self.predtimes.append(round(pred_time / len(X_test), ROUND_DIGIT_6))
        print(f"{name} accuracy in testing dataset: {accuracy:.2f}\t tt: {training_time:.2f} \t pt: {pred_time / len(X_test):.6f} ms")
        return round(accuracy, ROUND_DIGIT_2)

    def run_KNN(self, X_train, X_test, y_train, y_test):
        """
        Train and evaluate a 3-nearest-neighbors classifier.

        Original author's note: using 3, 4 or 5 neighbors did not change
        the accuracy.
        @return rounded accuracy
        """
        return self._fit_and_score(KNeighborsClassifier(n_neighbors=3), "KNN",
                                   X_train, X_test, y_train, y_test)

    def run_GaussianNB(self, X_train, X_test, y_train, y_test):
        """Train and evaluate Gaussian naive Bayes. @return rounded accuracy"""
        return self._fit_and_score(GaussianNB(), "GaussianNB",
                                   X_train, X_test, y_train, y_test)

    def run_DecisionTree(self, X_train, X_test, y_train, y_test):
        """Train and evaluate a decision tree. @return rounded accuracy"""
        return self._fit_and_score(DecisionTreeClassifier(random_state=42), "DecisionTree",
                                   X_train, X_test, y_train, y_test)

    def run_SVM(self, X_train, X_test, y_train, y_test):
        """Train and evaluate a support vector classifier. @return rounded accuracy"""
        return self._fit_and_score(SVC(random_state=42), "SVM",
                                   X_train, X_test, y_train, y_test)

    def run_RandomForest(self, X_train, X_test, y_train, y_test):
        """Train and evaluate a random forest. @return rounded accuracy"""
        return self._fit_and_score(RandomForestClassifier(random_state=42), "RandomForest",
                                   X_train, X_test, y_train, y_test)

    def run_MLP(self, X_train, X_test, y_train, y_test):
        """
        Train and evaluate a fully connected multi-layer perceptron with
        hidden layers of 64, 128 and 32 units.

        Other layer layouts tried by the original author: (50, 25, 10) and
        (10, 5), with max_iter 2000.
        @return rounded accuracy
        """
        mlp = MLPClassifier(hidden_layer_sizes=(64, 128, 32), activation='relu', max_iter=4000,
                            learning_rate='adaptive', random_state=42)
        return self._fit_and_score(mlp, "MLP", X_train, X_test, y_train, y_test)

    def run_LDA(self, X_train, X_test, y_train, y_test):
        """
        Train and evaluate linear discriminant analysis.

        Original author's note: normal performance; motivated by the paper
        'Indoor Multifloor Localization Method Based on WiFi Fingerprints
        and LDA'.

        Unlike the other run_* methods no timing is recorded and the rounded
        accuracy is also appended to ``self.accuracy_list``.
        @return rounded accuracy
        """
        lda = LinearDiscriminantAnalysis()
        lda.fit(X_train, y_train)
        y_pred = lda.predict(X_test)

        accuracy = self.accuracy_score(y_test, y_pred)
        # The original stored an undefined name here instead of the fitted LDA.
        self._model = lda
        rounded = round(accuracy, ROUND_DIGIT_2)
        self.accuracy_list.append(rounded)
        print(f"LDA accuracy in testing dataset: {accuracy:.2f}")
        return rounded

    def run_NN(self, X_train, X_test, y_train, y_test, to_save=False):
        """
        Train and evaluate a small Keras network (one hidden layer, 64 units).

        @param y_train, y_test one-hot encoded labels (see split_scale_nn)
        @param to_save save the trained network to PATH_MODEL_NN
        The rounded test accuracy is appended to ``self.accuracy_list``.
        """
        # One softmax unit per one-hot column (40 with the default encoding).
        num_classes = np.shape(y_train)[1]

        model = tf.keras.Sequential()
        model.add(tf.keras.Input(shape=(X_train.shape[1],)))
        model.add(tf.keras.layers.Dense(units=64, activation='relu'))
        model.add(tf.keras.layers.Dense(units=num_classes, activation='softmax'))

        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        model.summary()
        # The epoch count matters: raise it while the accuracy is still increasing.
        model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)

        loss, accuracy = model.evaluate(X_test, y_test)
        self._model = model
        self.accuracy_list.append(round(accuracy, ROUND_DIGIT_2))
        print(f"Neutron network accuracy in testing dataset: {accuracy:.2f}")
        if to_save:
            model.save(PATH_MODEL_NN)

    def predict(self, X):
        """
        Predict the zone for raw (unscaled) RSSI readings.

        @param X one sample [a, b, ...] or a list of samples [[a, b, ...], ...]
        @return a single label for a single sample, an array of labels for a
                list of samples, None for None or empty input
        """
        # Check None before converting: np.array(None) is a 0-d array whose
        # len() raises TypeError.
        if X is None:
            return None
        X = np.asarray(X)
        if X.size == 0:
            return None

        islist = X.ndim != 1
        if not islist:
            X = np.reshape(X, (1, -1))

        X_df = pd.DataFrame(X, columns=DATA_HEADER_RSSI)
        X_scaled = self._scaler.transform(X_df)
        y_pred = self._model.predict(X_scaled)
        return y_pred if islist else y_pred[0]

    def predict_scaled(self, X):
        """
        Predict for samples that are already standardized.

        @param X one sample or a 2-D batch, already scaled
        @return whatever the model's predict returns for the batch
        """
        X = np.asarray(X)
        if X.ndim == 1:
            X = np.reshape(X, (1, -1))
        return self._model.predict(X)

    def visualize(self, df: pd.DataFrame):
        """
        Scatter plot of AP1 against AP3, every point annotated with its label.
        """
        ax = sns.scatterplot(data=df, x="AP1", y="AP3", hue="label")
        for _, point in df.iterrows():
            # Annotate at the plotted coordinates (the y axis is AP3, not AP2).
            ax.text(point["AP1"], point["AP3"], str(point["label"]))
        plt.show()

    def accuracy_score(self, y_true, y_pred):
        """
        Fraction of correct predictions.

        When ``self.allow_neighbor`` is True, a prediction that is listed in
        ``self.neighbor_zones[true_label]`` also counts as correct.

        @return accuracy in [0, 1]; 0.0 for empty input
        @raise ValueError if the two sequences differ in length
        """
        if len(y_true) != len(y_pred):
            raise ValueError("The length of y_true and y_pred must be the same.")
        if len(y_true) == 0:
            return 0.0

        correct_predictions = 0
        for y_t, y_p in zip(y_true, y_pred):
            if y_t == y_p:
                correct_predictions += 1
            elif self.allow_neighbor and y_p in self.neighbor_zones.get(y_t, ()):
                correct_predictions += 1

        return correct_predictions / len(y_true)

    def init_nearbyzone(self, csv_file_path=r'FingerPrintWifi/config/zone39.csv'):
        """
        Read the zone grid and build the table of the eight surrounding zones
        of every zone, then switch accuracy_score to the relaxed criterion in
        which those surrounding zones also count as correctly predicted.

        @param csv_file_path header-less CSV holding the zone layout as a grid
        @return dict: zone -> list of its 8 neighbors, -1 where the neighbor
                would lie outside the grid
        """
        matrix = pd.read_csv(csv_file_path, header=None).values
        rows, cols = matrix.shape
        result = {}

        # Up, Down, Left, Right, Top-Left, Top-Right, Bottom-Left, Bottom-Right
        directions = [(-1, 0), (1, 0), (0, -1), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1)]

        for r in range(rows):
            for c in range(cols):
                neighbors = []
                for dr, dc in directions:
                    nr, nc = r + dr, c + dc
                    if 0 <= nr < rows and 0 <= nc < cols:
                        neighbors.append(matrix[nr, nc])
                    else:
                        neighbors.append(-1)
                result[matrix[r, c]] = neighbors
        self.allow_neighbor = True
        self.neighbor_zones = result
        return result

    def plot_confmatrix(self, y_test, y_pred):
        """Show the confusion matrix of the given predictions as a heat map."""
        from sklearn.metrics import confusion_matrix
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title('Confusion Matrix')
        plt.show()


############################# TEST CASES #############################

def Test_predict(model: MLModel, df: pd.DataFrame):
    """
    Load the saved decision tree, predict three hand-picked RSSI samples and
    then report the accuracy on the hold-out part of df.
    """
    model.init_model("DT")
    known_samples = (
        ([-39,-48,-48,-54,-74,-76], "; real = 0"),
        ([-50,-57,-39,-40,-72,-82], "; real = 11"),
        ([-51,-63,-38,-43,-75,-81], "; real = 14"),
    )
    for rssi, truth_text in known_samples:
        print("pred =", model.predict(rssi), truth_text)

    # The split is unscaled on purpose: model.predict applies the scaler itself.
    _, X_holdout, _, y_holdout = model.split(df)
    y_hat = model.predict(X_holdout)
    holdout_accuracy = accuracy_score(y_holdout, y_hat)
    print(f"Accuracy: {holdout_accuracy:.2f}")
    print(f"pred: \n{y_hat[:24]}")
    print(f"real: \n{y_holdout.to_numpy()[:24]}")

def Test_NN(model: MLModel, df: pd.DataFrame):
    """
    Train the Keras network, save it, reload it from disk and print the
    predictions of the reloaded network for the test set.
    """
    X_train_scaled, X_test_scaled, y_train, y_test = model.split_scale_nn(df)
    # run_NN writes the network to PATH_MODEL_NN in Keras format.
    model.run_NN(X_train_scaled, X_test_scaled, y_train, y_test, to_save=True)

    # Persist only the scaler. Calling save_model(PATH_MODEL_NN) here would
    # pickle over the HDF5 file that run_NN just wrote, and load_NN below
    # would then read a corrupted file.
    os.makedirs(os.path.dirname(PATH_SCALER), exist_ok=True)
    with open(PATH_SCALER, 'wb') as f:
        pickle.dump(model._scaler, f)

    model.load_NN(PATH_MODEL_NN)
    print("X1=", X_test_scaled, " y1=", y_test, " y_predict=", model.predict_scaled( X_test_scaled))

def Test_modeltrain_nocv(model: MLModel, df: pd.DataFrame):
    """
    Train and evaluate every classifier on one train/test split
    (no K-fold cross-validation).

    The training and prediction times of each classifier are appended to
    model.traintimes / model.predtimes by the run_* methods.

    @return list of accuracy [knn, mlp, dt, gnb, rf, svm]; the same list is
            stored in model.accuracy_list
    """
    model.accuracy_list = []
    X_train_scaled, X_test_scaled, y_train, y_test = model.split_scale(df)

    # The run_* methods return their accuracy instead of appending it, so the
    # results have to be collected here; otherwise the returned list is empty.
    runners = (
        model.run_KNN,
        model.run_MLP,
        model.run_DecisionTree,
        model.run_GaussianNB,
        model.run_RandomForest,
        model.run_SVM,
    )
    for run in runners:
        model.accuracy_list.append(run(X_train_scaled, X_test_scaled, y_train, y_test))

    # To persist a classifier, call model.save_model(PATH_MODEL_xxx) right
    # after the corresponding run_* call (model._model is the latest one).
    return model.accuracy_list


def Test_modeltrain(model: MLModel, df: pd.DataFrame, k_folds=5):
    """
    Runs cross-validation on multiple ML models and returns a list of average accuracies.
    Uses stratified K-Fold Cross-Validation instead of a single train-test split.

    In every fold the scaler is fitted on the training part only and then
    applied unchanged to the test part.

    @param k_folds number of folds
    @return list of average accuracy [knn, mlp, dt, gnb, rf, svm]
    """
    model.accuracy_list = []

    X = df.drop(columns=["label"])
    y = df["label"]

    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

    # Insertion order defines the order of the returned list (same as MODEL_NAMES).
    runners = {
        "KNN": model.run_KNN,
        "MLP": model.run_MLP,
        "DecisionTree": model.run_DecisionTree,
        "GaussianNB": model.run_GaussianNB,
        "RandomForest": model.run_RandomForest,
        "SVM": model.run_SVM,
    }
    results = {name: [] for name in runners}

    for train_idx, test_idx in skf.split(X, y):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_idx])
        # transform, not fit_transform: re-fitting on the test fold would
        # standardize train and test data with different means and variances.
        X_test = scaler.transform(X.iloc[test_idx])

        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        for name, run in runners.items():
            results[name].append(run(X_train, X_test, y_train, y_test))

    avg_accuracies = {name: round(np.mean(acc_list), 3) for name, acc_list in results.items()}

    print("\n=== Average Accuracies from K-Fold Cross-Validation ===")
    for name, acc in avg_accuracies.items():
        print(f"{name}: {acc:.3f}")

    return list(avg_accuracies.values())

def Test_modeltime():
    """
    Repeat the single-split training 20 times and write two tables: the
    training time of each classifier per repetition, and the per-sample
    prediction time. Each table ends with a row of column averages.

    Original author's note: run in 'High Performance' mode under Windows.
    """
    # Forward slashes resolve on every platform; the former backslash path
    # only resolved on Windows.
    data_path = "FingerPrintWifi/PubData/RSSI_rawdata_4days_sample200_37zone.csv"
    # The data is identical for every repetition, so read it only once.
    df = MLModel().load_data2(data_path)

    traintime_table = []
    predtime_table = []
    for _ in range(20):
        # A fresh instance per repetition starts with empty timing lists.
        model = MLModel()
        model.workpath = "./result_rssi"
        Test_modeltrain_nocv(model, df)
        traintime_table.append(model.traintimes)
        predtime_table.append(model.predtimes)

    df2 = pd.DataFrame(traintime_table)
    mean_values = df2.mean()
    df2.loc['Average'] = mean_values.round(ROUND_DIGIT_6)
    df2.to_csv(r"./FingerPrintWifi/result_rssi/traintime_table.csv", index=False, header=False)
    df3 = pd.DataFrame(predtime_table)
    mean_values = df3.mean()
    df3.loc['Average'] = mean_values.round(ROUND_DIGIT_6)
    df3.to_csv(r"./FingerPrintWifi/result_rssi/predtime_table.csv", index=False, header=False)

def Test_re_rssi(model: MLModel, df: pd.DataFrame):
    """
    Compare cross-validated accuracy on absolute RSSI values with accuracy on
    relative RSSI (differences between consecutive access points) and write
    both rows to PATH_RSSI_TYPE.
    """
    absolute_scores = Test_modeltrain(model, df)

    # (new column, minuend, subtrahend)
    delta_specs = (
        ('delta1', 'AP2', 'AP1'),
        ('delta2', 'AP3', 'AP2'),
        ('delta3', 'AP4', 'AP3'),
        ('delta4', 'AP5', 'AP4'),
        ('delta5', 'AP6', 'AP5'),
    )
    working = df.copy(deep=True)
    for target, upper, lower in delta_specs:
        working[target] = working[upper] - working[lower]
    relative_frame = working[['delta1', 'delta2', 'delta3', 'delta4', 'delta5', 'label']]
    relative_scores = Test_modeltrain(model, relative_frame)

    table = pd.DataFrame([absolute_scores, relative_scores], columns=MODEL_NAMES)
    table.insert(0, 'RSSIType', ["absolute rssi", "relative rssi"])
    table.to_csv(PATH_RSSI_TYPE, index=False)

def _zonearea2(model: MLModel, df: pd.DataFrame):
    """
    Merge zones in pairs as listed in zone2merge.csv (columns label1, label2,
    merged_label) and cross-validate on the relabelled data.

    Rows labelled 12, 13 or 38 are removed first.
    @return list of average accuracies from Test_modeltrain
    """
    pairs = pd.read_csv('FingerPrintWifi/config/zone2merge.csv')

    excluded_rows = df.index[df['label'].isin([12, 13, 38])]
    relabelled = df.copy(deep=True).drop(excluded_rows)

    # Masks are built from the untouched input labels, so a merged label
    # written by an earlier row is never merged a second time.
    source_labels = df['label']
    for _, pair in pairs.iterrows():
        members = [pair['label1'], pair['label2']]
        relabelled.loc[source_labels.isin(members), 'label'] = pair['merged_label']
    return Test_modeltrain(model, relabelled)

def _zonearea3(model: MLModel, df: pd.DataFrame):
    """
    Merge zones in groups of three (first three columns of zone3merge.csv
    -> merged_label) and cross-validate on the relabelled data.

    Rows labelled 12, 13 or 38 are removed first.
    @return list of average accuracies from Test_modeltrain
    """
    groups = pd.read_csv('FingerPrintWifi/config/zone3merge.csv')
    excluded_rows = df.index[df['label'].isin([12, 13, 38])]
    relabelled = df.copy(deep=True).drop(excluded_rows)

    # Masks come from the original labels, not from the frame being rewritten.
    source_labels = df['label']
    for _, group in groups.iterrows():
        in_group = source_labels.isin(group.iloc[:3])
        relabelled.loc[in_group, 'label'] = group['merged_label']
    return Test_modeltrain(model, relabelled)

def _zonearea4(model: MLModel, df: pd.DataFrame):
    """
    Merge zones in groups of four (first four columns of zone4merge.csv
    -> merged_label) and cross-validate on the relabelled data.

    Rows labelled 12, 13 or 38 are removed first.
    NOTE(review): the previous docstring stated 9 resulting zones (0-8) of
    72 m² each, while Test_zonesize records 64 for this case - confirm.
    @return list of average accuracies from Test_modeltrain
    """
    groups = pd.read_csv('FingerPrintWifi/config/zone4merge.csv')
    excluded_rows = df.index[df['label'].isin([12, 13, 38])]
    relabelled = df.copy(deep=True).drop(excluded_rows)

    # Masks come from the original labels, not from the frame being rewritten.
    source_labels = df['label']
    for _, group in groups.iterrows():
        in_group = source_labels.isin(group.iloc[:4])
        relabelled.loc[in_group, 'label'] = group['merged_label']
    return Test_modeltrain(model, relabelled)


def _zonearea6(model: MLModel, df: pd.DataFrame):
    """
    Merge zones in groups of six (first six columns of zone6merge.csv
    -> merged_label) and cross-validate on the relabelled data.

    Rows labelled 12, 13 or 38 are removed first.
    @return list of average accuracies from Test_modeltrain
    """
    groups = pd.read_csv('FingerPrintWifi/config/zone6merge.csv')
    excluded_rows = df.index[df['label'].isin([12, 13, 38])]
    relabelled = df.copy(deep=True).drop(excluded_rows)

    # Masks come from the original labels, not from the frame being rewritten.
    source_labels = df['label']
    for _, group in groups.iterrows():
        in_group = source_labels.isin(group.iloc[:6])
        relabelled.loc[in_group, 'label'] = group['merged_label']
    return Test_modeltrain(model, relabelled)


def _zonearea12(model: MLModel, df: pd.DataFrame):
    """
    Merge zones in groups of twelve (first twelve columns of zone12merge.csv
    -> merged_label) and cross-validate on the relabelled data.

    Rows labelled 12, 13 or 38 are removed first.
    @return list of average accuracies from Test_modeltrain
    """
    groups = pd.read_csv('FingerPrintWifi/config/zone12merge.csv')
    excluded_rows = df.index[df['label'].isin([12, 13, 38])]
    relabelled = df.copy(deep=True).drop(excluded_rows)

    # Masks come from the original labels, not from the frame being rewritten.
    source_labels = df['label']
    for _, group in groups.iterrows():
        in_group = source_labels.isin(group.iloc[:12])
        relabelled.loc[in_group, 'label'] = group['merged_label']
    return Test_modeltrain(model, relabelled)

def plot_zonesize(filepath):
    """
    Plot accuracy against zone size for every model column of the CSV at
    filepath and save the figure as accuracy_zonesize.svg.

    The first CSV column must be 'ZoneSize'; the x axis is inverted.
    """
    table = pd.read_csv(filepath)
    # Other figure sizes tried: (10, 6), (8, 6)
    plt.figure(figsize=(6, 6))
    plt.grid(True, which='both', linestyle='--', linewidth=0.5, color='gray')
    dash_patterns = ['-', '--', '-.', ':', '-', '--']
    point_shapes = ['o', 's', 'D', '^', 'v', 'p']

    x_values = table['ZoneSize']
    for position, column in enumerate(table.columns[1:]):
        dash = dash_patterns[position % len(dash_patterns)]
        shape = point_shapes[position % len(point_shapes)]
        plt.plot(x_values, table[column], linestyle=dash, marker=shape, label=column)

    plt.xticks(x_values)
    plt.title('Model Accuracy vs Zone Size')
    plt.xlabel('Zone Size ($m^2$)')
    plt.ylabel('Accuracy')
    plt.gca().invert_xaxis()
    plt.legend()
    plt.savefig(r"./FingerPrintWifi/result_rssi/accuracy_zonesize.svg", format='svg', bbox_inches='tight')

def plot_samplenum(filepath):
    """
    Plot accuracy against the number of samples per zone for every model
    column of the CSV at filepath and save the figure as accuracy_samples.svg.

    The first CSV column must be 'SampleNum'.
    """
    table = pd.read_csv(filepath)
    plt.figure(figsize=(6, 6))
    plt.grid(True, which='both', linestyle='--', linewidth=0.5, color='gray')
    dash_patterns = ['-', '--', '-.', ':', '-', '--']
    point_shapes = ['o', 's', 'D', '^', 'v', 'p']

    x_values = table['SampleNum']
    for position, column in enumerate(table.columns[1:]):
        dash = dash_patterns[position % len(dash_patterns)]
        shape = point_shapes[position % len(point_shapes)]
        plt.plot(x_values, table[column], linestyle=dash, marker=shape, label=column)
    plt.xticks(x_values)
    plt.title('Model Accuracy vs zone Sampling Number')
    plt.xlabel('Samples, pc per zone')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.savefig(r"./FingerPrintWifi/result_rssi/accuracy_samples.svg", format='svg', bbox_inches='tight')

def plot_apnum(filepath):
    """
    Plot accuracy against the number of access points for every model column
    of the CSV at filepath and save the figure as accuracy_aps.svg.

    The first CSV column must be 'APNums'.
    """
    table = pd.read_csv(filepath)
    plt.figure(figsize=(6, 6))
    plt.grid(True, which='both', linestyle='--', linewidth=0.5, color='gray')
    dash_patterns = ['-', '--', '-.', ':', '-', '--']
    point_shapes = ['o', 's', 'D', '^', 'v', 'p']

    x_values = table['APNums']
    for position, column in enumerate(table.columns[1:]):
        dash = dash_patterns[position % len(dash_patterns)]
        shape = point_shapes[position % len(point_shapes)]
        plt.plot(x_values, table[column], linestyle=dash, marker=shape, label=column)
    plt.xticks(x_values)
    plt.title('Model Accuracy vs zone Access point Number')
    plt.xlabel('AP Number: pc')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.savefig(r"./FingerPrintWifi/result_rssi/accuracy_aps.svg", format='svg', bbox_inches='tight')

def Test_zonesize(model: MLModel, df: pd.DataFrame):
    """
    Evaluate all models for six zone granularities (12, 6, 4, 3 and 2 merged
    zones, then the unmerged zones), store the accuracies in PATH_ZONE_SIZE
    and plot them.
    """
    # Coarsest first; this order must match zone_sizes below.
    evaluations = (_zonearea12, _zonearea6, _zonearea4, _zonearea3, _zonearea2, Test_modeltrain)
    record = [evaluate(model, df) for evaluate in evaluations]

    # NOTE(review): 64 and 48 are not multiples of the 18 used for a single
    # zone (4 x 18 = 72, 3 x 18 = 54) - confirm the intended areas.
    zone_sizes = [216, 108, 64, 48, 36, 18]
    table = pd.DataFrame(record, columns=MODEL_NAMES)
    table.insert(0, 'ZoneSize', zone_sizes)
    table.to_csv(PATH_ZONE_SIZE, index=False)
    plot_zonesize(PATH_ZONE_SIZE)


def Test_samplenum(model: MLModel, df: pd.DataFrame):
    """
    Evaluate all models on 10 %, 20 %, 40 %, 60 %, 80 % and 100 % of the
    samples of every label. The goal is to determine the minimum number of
    sampling points per zone.

    The accuracies are written to PATH_SAMPLING_NUM and plotted. The
    'SampleNum' column is the proportion multiplied by the sample count of
    label 0 (or of the most frequent label when label 0 is absent).
    """
    proportions = [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]
    label_counts = df['label'].value_counts()
    # label_counts[0] alone raises KeyError when the data has no label 0.
    if 0 in label_counts.index:
        reference_count = label_counts[0]
    else:
        reference_count = label_counts.iloc[0]
    sample_nums = [p * reference_count for p in proportions]

    # Collect the per-label samples and concatenate once per proportion,
    # instead of growing a DataFrame inside the nested loop.
    sampled_parts = {p: [] for p in proportions}
    for label in df['label'].unique():
        df_label = df[df['label'] == label]
        for p in proportions:
            sampled_parts[p].append(df_label.sample(frac=p, random_state=1))

    record = []
    for p in proportions:
        df_sampled = pd.concat(sampled_parts[p])
        record.append(Test_modeltrain(model, df_sampled))
    print("sampling points accuracy ", record)
    result = pd.DataFrame(record, columns=MODEL_NAMES)
    result.insert(0, 'SampleNum', sample_nums)
    result.to_csv(PATH_SAMPLING_NUM, index=False)
    plot_samplenum(PATH_SAMPLING_NUM)

def Test_apnum(model: MLModel, df: pd.DataFrame):
    """
    Evaluate all models with the first 2, 3, 4, 5 and 6 access points
    (AP1-AP2, AP1-AP3, ..., AP1-AP6) to explore the minimum number of APs.

    The accuracies are written to PATH_AP_NUM and plotted.
    """
    columns = ['AP1']

    record = []
    for i in range(2, 7):
        columns.append(f'AP{i}')
        new_df = df[columns + ['label']]
        record.append(Test_modeltrain(model, new_df))
    # The message used to say "sampling points", copied from Test_samplenum.
    print("AP number accuracy ", record)
    result = pd.DataFrame(record, columns=MODEL_NAMES)
    result.insert(0, 'APNums', range(2, 7))
    result.to_csv(PATH_AP_NUM, index=False)
    plot_apnum(PATH_AP_NUM)


def accuracy_compare(data1, data2):
    """
    Tabulate exact accuracy (data1) against neighbor-tolerant accuracy
    (data2) per model, together with the rounded difference, then print the
    table and save it to PATH_NEIGHBOR_ZONE.
    """
    gains = []
    for exact, relaxed in zip(data1, data2):
        gains.append(round(relaxed - exact, ROUND_DIGIT_2))

    per_model = pd.DataFrame({
        'Model': MODEL_NAMES,
        'Accuracy (exact)': data1,
        'Accuracy (plus neighboring zones)': data2,
        'Improvement': gains
    })
    # One column per model, one row per metric.
    summary = per_model.set_index('Model').T

    print(summary)
    summary.to_csv(PATH_NEIGHBOR_ZONE)
    print(f"CSV Save in: {PATH_NEIGHBOR_ZONE}")

def neighbor_influence(model: MLModel, df: pd.DataFrame):
    """
    Cross-validate once with the exact-match criterion and once with
    neighboring zones accepted, then report both via accuracy_compare.

    init_nearbyzone leaves the neighbor-tolerant criterion switched on in model.
    """
    strict_scores = Test_modeltrain(model, df)
    model.init_nearbyzone()
    tolerant_scores = Test_modeltrain(model, df)
    accuracy_compare(strict_scores, tolerant_scores)

# Script entry point: load the combined RSSI data set and run one experiment.
if __name__ == "__main__":
    model = MLModel()
    model.workpath = "./FingerPrintWifi/result_tmp"
    df = model.load_data2(r"./RSSI_rawdata_4days_sample200_37zone.csv")

    # !Please run each test separately, i.e. enable only one Test_xx call per execution.
    Test_modeltrain(model, df)
    # Test_re_rssi(model, df)
    # neighbor_influence(model, df)
    # Test_zonesize(model, df)
    # Test_samplenum(model, df)
    # Test_apnum(model, df)
    # Test_modeltime()
    




