% This function applies standard (SMOTE-style interpolation) oversampling to the borderline samples of each class

% k             = Number of nearest neighbours
% Beta          = The ratio to which the classes are balanced

function [Ground_Truth_Data_list_total, Feature_data_matrix_total, number_oversampling_list] = Borderline_Oversampler(Ground_truth_data_list, class_values, Feature_data_matrix, k, Beta, number_classes, number_samples_list, class_samples_cell, number_features)
    % BORDERLINE_OVERSAMPLER  Oversample the borderline samples of each class by interpolation.
    %
    % For every class, the samples whose nearest neighbours are mostly (but not
    % exclusively) of another class are marked as borderline. New samples are
    % generated on the line segment between a borderline sample and one of its
    % same-class neighbours, and appended to the original data.
    %
    % Inputs:
    %   Ground_truth_data_list  - Class label per row of Feature_data_matrix. Expected to be a
    %                             column vector, as the new labels are appended vertically.
    %   class_values            - The label value of each class (indexed 1 : number_classes)
    %   Feature_data_matrix     - One row per sample, one column per feature
    %   k                       - Number of nearest neighbours
    %   Beta                    - The ratio to which the classes are balanced
    %   number_classes          - Number of classes
    %   number_samples_list     - Number of samples of each class
    %   class_samples_cell      - Per class, the row indices of its samples in Feature_data_matrix
    %   number_features         - Number of columns of Feature_data_matrix
    %
    % Outputs:
    %   Ground_Truth_Data_list_total - Original labels followed by the labels of the oversamples
    %   Feature_data_matrix_total    - Original feature data followed by the oversamples
    %   number_oversampling_list     - Number of oversamples generated per class (never negative)

    %%% Inputs %%%
        % The maximum allowed distance between neighbouring samples. As the features are normalised, making this 1e2 for instance will make it not apply
        distance_threshold  = 100;

        % Whether the maximum distance between two features or the Euclidean distance is used
        distance_type       = 'Euclidean';      % [Euclidean, Max]

    %%% Oversample the classes %%%
        Feature_data_cell = cell(1, number_classes);
        Class_data_cell = cell(1, number_classes);

        number_oversampling_list = zeros(1, number_classes);

        % Size of the largest class, the reference for the class (im)balance
        number_samples_max = max(number_samples_list);

        for c = 1 : number_classes
            class_value = class_values(c);

            number_samples = number_samples_list(c);
            class_imbalance = number_samples / number_samples_max;

            % If there is only one class sample, it doesn't have any neighbours and cannot be oversampled through SMOTE
            if number_samples < 2
                continue
            end

            % The class samples are put in random order
            % (done before any early exit so that the random number stream does not depend on the skip)
            class_samples = class_samples_cell{c};
            random_order = randperm(number_samples);
            class_samples = class_samples(random_order);

            % Number of samples that need to be generated through oversampling.
            % A class that is already at or above the requested ratio yields a non-positive
            % value; it is clamped so that a negative count is never reported.
            number_oversamples_class = max(0, round((Beta - class_imbalance) * number_samples_max));

            % Check if this class needs to be oversampled
            if number_oversamples_class == 0
                continue
            end

            % Detect borderline samples
            borderline_indices_list = zeros(1, number_samples);
            neighbour_indices_cell = cell(1, number_samples);

            for i = 1 : number_samples
                % The feature data belonging to this sample
                sample = class_samples(i);
                Feature_data_sample = Feature_data_matrix(sample, :);

                % The squared per-feature distance to all samples
                distance_matrix = (Feature_data_matrix - Feature_data_sample).^2;

                % The maximum or Euclidean distance is used
                switch distance_type
                    case 'Euclidean'
                        distance_list = sum(distance_matrix, 2);
                    case 'Max'
                        distance_list = max(distance_matrix, [], 2);
                    otherwise
                        error('Borderline_Oversampler:distanceType', 'Unknown distance type: %s', distance_type);
                end

                distance_list = sqrt(distance_list);

                % Find k nearest neighbours, that don't exceed the distance threshold
                [distance_list_sorted, ind] = sort(distance_list, 'ascend');
                ind(distance_list_sorted > distance_threshold) = [];

                % The sample itself is removed explicitly. It is not necessarily the first
                % sorted entry: an exact duplicate with a lower row index sorts before it.
                ind(ind == sample) = [];

                number_neighbours = min([number_samples - 1, length(ind), k]);      % The number of samples can't be exceeded
                neighbours = ind(1 : number_neighbours);

                % Determine how many of these neighbours belong to the same class
                neighbours_classes = Ground_truth_data_list(neighbours);
                neighbours_same_class = neighbours(neighbours_classes == class_value);

                number_same_class = length(neighbours_same_class);
                percentage_same_class = number_same_class / number_neighbours * 100;   % NaN without neighbours, which fails both tests below

                % Samples with no similar neighbours are considered outliers
                % Samples with at least half of the nearest neighbours being the same class are not considered borderline cases
                % (note that an exact 50 % tie is therefore not treated as borderline)
                if percentage_same_class > 0 && percentage_same_class < 50
                    borderline_indices_list(i) = 1;
                    neighbour_indices_cell{i} = neighbours_same_class;
                end
            end

            % The number of borderline samples
            borderline_indices = find(borderline_indices_list == 1);
            neighbour_indices_cell = neighbour_indices_cell(borderline_indices);
            number_borderline_samples = length(borderline_indices);

            % If no borderline samples exist, no oversamples can be generated
            if number_borderline_samples == 0
                continue
            end

            % List containing the number of oversamples each sample must generate.
            % The total is divided evenly; the remainder is handed to the first samples.
            number_oversamples_list = floor(number_oversamples_class / number_borderline_samples) * ones(1, number_borderline_samples);

            oversamples_remainder = mod(number_oversamples_class, number_borderline_samples);
            number_oversamples_list(1 : oversamples_remainder) = number_oversamples_list(1 : oversamples_remainder) + 1;

            % Oversample the borderline samples
            Feature_data_oversampling_cell = cell(1, number_borderline_samples);

            for b = 1 : number_borderline_samples
                % This sample's feature data
                borderline_ind = borderline_indices(b);
                borderline_sample = class_samples(borderline_ind);
                Feature_data_sample = Feature_data_matrix(borderline_sample, :);

                % The same-class neighbours (at least one, as required by the borderline test)
                neighbours = neighbour_indices_cell{b};
                number_neighbours = length(neighbours);

                % Generate the oversamples
                number_oversamples = number_oversamples_list(b);

                % The list is non-increasing, so once a sample has nothing to
                % generate, neither have the remaining ones
                if number_oversamples == 0
                    break
                end

                Feature_data_oversamples = zeros(number_oversamples, number_features);

                for o = 1 : number_oversamples
                    % This neighbour's data
                    neighbour_ind = mod(o - 1, number_neighbours) + 1;      % The neighbours may need to be cycled through
                    neighbour = neighbours(neighbour_ind);
                    Feature_data_neighbour = Feature_data_matrix(neighbour, :);

                    % Random normalised distance between the two samples
                    distance = rand(1);

                    % Feature data of the oversample, a point on the segment between sample and neighbour
                    Feature_data_oversamples(o, :) = Feature_data_sample + distance * (Feature_data_neighbour - Feature_data_sample);
                end

                % Append the data
                Feature_data_oversampling_cell{b} = Feature_data_oversamples;
            end

            number_oversampling_list(c) = number_oversamples_class;

            % Append the oversampled data and the matching class labels
            Feature_data_cell{c} = vertcat(Feature_data_oversampling_cell{:});
            Class_data_cell{c} = class_value * ones(number_oversamples_class, 1);
        end

        % Combine the oversampled feature and class data
        Oversampled_feature_data = vertcat(Feature_data_cell{:});
        Oversampled_classes_data = vertcat(Class_data_cell{:});

        % The total data
        Ground_Truth_Data_list_total = [Ground_truth_data_list; Oversampled_classes_data];
        Feature_data_matrix_total = [Feature_data_matrix; Oversampled_feature_data];
end