% The aim is to cluster (partition) the data in such a way that it remains suitable for classification
% The optimisation objective therefore consists of two parts:
%   1. Maximise the minimum area of the land masses within the chunk (avoid having numerous small slivers of land)
%   2. Maximise the ratio of valid to invalid data

% Cluster_Optimisation  = [On, Off]; if set to Off the chunks are evenly spaced, otherwise the partitions are optimised for the two objectives listed above

% Additionally, it provides the source of the data (Copernicus, Landsat, MODIS), given the data choice

function [rows_chunks, columns_chunks, rows_data, columns_data, R_CLC, zero_string, Data_source] = Smart_Clusterer(Cluster_Optimisation, Data_choices, number_chunks_x, number_chunks_y)
% SMART_CLUSTERER  Partition the land cover raster into rectangular chunks.
%
% Inputs:
%   Cluster_Optimisation - 'On' to optimise the partition positions with a
%                          genetic algorithm, anything else keeps the evenly
%                          spaced default partitions
%   Data_choices         - cell array; elements 1-5 are the 'Yes'/'No' flags of
%                          the NDVI, NDVI component, LAI, FAPAR and Fcover data,
%                          element 6 is a cell array of 'Yes'/'No' flags per
%                          spectral band (its 4th element is the NIR band)
%   number_chunks_x      - number of chunks along the columns
%   number_chunks_y      - number of chunks along the rows
%
% Outputs:
%   rows_chunks          - row boundaries, from 0 up to rows_data
%   columns_chunks       - column boundaries, from 0 up to columns_data
%   rows_data            - number of rows of the land cover raster
%   columns_data         - number of columns of the land cover raster
%   R_CLC                - second output of geotiffread for the land cover file
%   zero_string          - sprintf format used to zero-pad chunk numbers
%   Data_source          - 'Copernicus', 'MODIS' or 'Landsat'
%
% Side effects: writes Data_Clusters.png and, when optimising,
% Data_Clusters_Optimised.png and Optimised_Partitions.mat to the current folder.

    %%% Inputs %%%
        % Allowed deviation of a partition from its default position
        allowed_change = 15;        % [%]

        % Coarsening factor used during optimisation, the final results are in the native resolution
        coarsening_factor = 10;     % [-]

    %%% Chunk identification %%%
        % The number of digits is used to prepend zeroes to the chunk numbers, for clarity
        number_digits = ceil(log10(max(number_chunks_x, number_chunks_y) + 1));
        zero_string = sprintf('%%0%g.f', number_digits);

    %%% Retrieve the land cover data %%%
        CLC_file = first_matching_file('CLC_*.tif');
        require_file(CLC_file, 'CLC_*.tif');
        [CLC_data, R_CLC] = geotiffread(CLC_file);

        % The size of the data
        [rows_data, columns_data] = size(CLC_data);

    %%% Determine which LC data is missing %%%
        % Retrieve the data choices used to determine the water/land pixels
        NDVI_Data = Data_choices{1};
        NDVI_Component_Data = Data_choices{2};

        LAI_Data = Data_choices{3};
        FAPAR_Data = Data_choices{4};
        Fcover_Data = Data_choices{5};

        Spectral_Data = Data_choices{6};
        NIR_band = Spectral_Data{4};

        % Landsat is the data source as soon as any spectral band is selected.
        % Data_source is initialised so that the checks below never read an undefined variable.
        Data_source = '';

        if any(strcmp(Spectral_Data, 'Yes'))
            Data_source = 'Landsat';
        end

        % The following data types use one specific value to flag water pixels
        margin = 1e-4;  % To compensate for Matlab rounding

        if strcmp(NDVI_Data, 'Yes')
            % Copernicus
            NDVI_file = first_matching_file('NDVI_*.tiff');
            Water_value = 254;
            Data_source = 'Copernicus';

            % MODIS
            if isempty(NDVI_file)
                NDVI_file = first_matching_file('MOD13Q1_*.tiff');
                Water_value = -3000;
                Data_source = 'MODIS';
            end

            require_file(NDVI_file, 'NDVI_*.tiff or MOD13Q1_*.tiff');
            Data = geotiffread(NDVI_file);

            [land_indices, water_indices] = split_on_flag_value(Data, Water_value, margin);

        elseif strcmp(NDVI_Component_Data, 'Yes')
            % Copernicus
            Trend_LB_file = first_matching_file('NDVI_T_LB*.mat');
            Water_value = 254 / 250 - 0.08;
            Data_source = 'Copernicus';

            % MODIS
            if isempty(Trend_LB_file)
                Trend_LB_file = first_matching_file('MOD13Q1_T_LB*.mat');
                Water_value = 0;
                Data_source = 'MODIS';
            end

            require_file(Trend_LB_file, 'NDVI_T_LB*.mat or MOD13Q1_T_LB*.mat');
            Trend_LB_struct = load(Trend_LB_file);
            Data = Trend_LB_struct.T_LB_matrix;

            [land_indices, water_indices] = split_on_flag_value(Data, Water_value, margin);

        % Copernicus
        elseif strcmp(LAI_Data, 'Yes')
            Data = read_first_geotiff('LAI_*.tiff');
            [land_indices, water_indices] = split_on_flag_value(Data, 255, margin);
            Data_source = 'Copernicus';

        elseif strcmp(FAPAR_Data, 'Yes')
            Data = read_first_geotiff('FAPAR_*.tiff');
            [land_indices, water_indices] = split_on_flag_value(Data, 255, margin);
            Data_source = 'Copernicus';

        elseif strcmp(Fcover_Data, 'Yes')
            Data = read_first_geotiff('FCOVER_*.tiff');
            [land_indices, water_indices] = split_on_flag_value(Data, 255, margin);
            Data_source = 'Copernicus';

        % The Landsat data does not have such a flag
        % Instead, the NIR band provides good contrast between water and land
        elseif strcmp(NIR_band, 'Yes')
            Landsat_file_name = first_matching_file('LS-B4_Fourier_Coefficients.mat');
            require_file(Landsat_file_name, 'LS-B4_Fourier_Coefficients.mat');
            Landsat_file = load(Landsat_file_name);

            Fourier_Coeff_cell = Landsat_file.Landsat_Fourier_Coeff_cell;

            % The constant (first) Fourier coefficient of every pixel
            Data = cellfun(@(FS) FS(1), Fourier_Coeff_cell);

            Water_value = 30;

            land_indices = find(Data > Water_value);
            water_indices = Data <= Water_value;

        % If the NIR band is unavailable, no water pixels are determined
        elseif strcmp(NIR_band, 'No') && strcmp(Data_source, 'Landsat')
            land_indices = 1 : rows_data * columns_data;
            water_indices = false(rows_data, columns_data);

        else
            error('Smart_Clusterer:noDataSelected', ...
                  'None of the data choices is set to ''Yes'', the land and water pixels cannot be determined.');
        end

    %%% Assign new LC data values %%%
        % Missing land indices are land pixels where the LC data is missing
        Missing_value = 0;

        missing_indices = find(CLC_data <= Missing_value);
        missing_land_indices = intersect(missing_indices, land_indices);

        % The valid data is given the value 1. This is done first, such that a valid
        % class code of 2 cannot be confused with the missing land flag assigned below.
        CLC_data(CLC_data > Missing_value) = 1;

        % Missing and water data is given the 0 value
        CLC_data(missing_indices) = 0;
        CLC_data(water_indices) = 0;

        % Missing land data is given the value 2
        CLC_data(missing_land_indices) = 2;

    %%% Define initial partitions %%%
        rows_chunks = round(linspace(0, rows_data, number_chunks_y + 1));
        columns_chunks = round(linspace(0, columns_data, number_chunks_x + 1));

    %%% Plot %%%
        % Half the line width (about 1% of the rows in total) makes the partitions easier to see
        line_width = round(0.01 * rows_data);
        line_width = max(0, round((line_width - 1) / 2));

        % The default partitions (interior boundaries only)
        row_partitions = rows_chunks(2 : number_chunks_y);
        column_partitions = columns_chunks(2 : number_chunks_x);

        default_partitions_alpha = partition_alpha_mask(rows_data, columns_data, row_partitions, column_partitions, line_width);
        save_partition_figure(1, CLC_data, default_partitions_alpha, 'Default partitions', 'Data_Clusters.png');

    %%% Optimisation %%%
    number_chunks = number_chunks_x * number_chunks_y;      % If there is only one cluster, there are no partitions to optimise

    if strcmp(Cluster_Optimisation, 'On') && number_chunks > 1
        % If valid partitions are already saved, they are loaded
        partitions_loaded = false;

        if exist('Optimised_Partitions.mat', 'file')
            partitions_file = load('Optimised_Partitions.mat');

            if isfield(partitions_file, 'rows_chunks') && isfield(partitions_file, 'columns_chunks')
                cached_rows = partitions_file.rows_chunks;
                cached_columns = partitions_file.columns_chunks;

                % The saved partitions are only usable for the same chunk counts and raster size
                if numel(cached_rows) == number_chunks_y + 1 && numel(cached_columns) == number_chunks_x + 1 ...
                        && cached_rows(end) == rows_data && cached_columns(end) == columns_data
                    rows_chunks = cached_rows;
                    columns_chunks = cached_columns;

                    row_partitions = rows_chunks(2 : number_chunks_y);
                    column_partitions = columns_chunks(2 : number_chunks_x);

                    partitions_loaded = true;
                end
            end

            if ~partitions_loaded
                warning('Smart_Clusterer:stalePartitions', ...
                        'Optimised_Partitions.mat does not match the current chunks or data size, the partitions are recomputed.');
            end
        end

        % Otherwise, they are computed
        if ~partitions_loaded
            % The resolution is coarsened
            CLC_data_c = CLC_data(1 : coarsening_factor : end, 1 : coarsening_factor : end);
            [rows_data_c, columns_data_c] = size(CLC_data_c);

            row_partitions = floor(row_partitions / coarsening_factor);
            column_partitions = floor(column_partitions / coarsening_factor);

            % The dimensions of the (coarsened) chunks
            height_chunk = round(rows_data / number_chunks_y / coarsening_factor);
            width_chunk = round(columns_data / number_chunks_x / coarsening_factor);

            allowed_change_rows = round(allowed_change / 100 * height_chunk);
            allowed_change_columns = round(allowed_change / 100 * width_chunk);

            % The bounds, kept within the coarsened raster
            lb_rows = max(1, row_partitions - allowed_change_rows);
            lb_columns = max(1, column_partitions - allowed_change_columns);
            lb = [lb_rows, lb_columns];

            ub_rows = min(rows_data_c, row_partitions + allowed_change_rows);
            ub_columns = min(columns_data_c, column_partitions + allowed_change_columns);
            ub = [ub_rows, ub_columns];

            % All parameters are integers
            number_parameters = length(ub);
            integer_parameters = 1 : number_parameters;

            % Optimisation settings
            options = optimoptions('ga', 'Display', 'off', 'useParallel', false);

            % Optimisation
            optimised_parameters = ga(@Cluster_Optimiser, number_parameters, [], [], [], [], lb, ub, [], integer_parameters, options);

            row_partitions = optimised_parameters(1 : number_chunks_y - 1);
            column_partitions = optimised_parameters(number_chunks_y : number_chunks_x + number_chunks_y - 2);

            % Check if the bounds are reached
            diff_bounds = [row_partitions - lb_rows, row_partitions - ub_rows, ...
                           column_partitions - lb_columns, column_partitions - ub_columns];

            if any(diff_bounds == 0)
                disp('Some of the bounds have been reached during optimisation. Consider increasing the allowed change.')
            end

            % The resolution is increased again
            row_partitions = coarsening_factor * row_partitions;
            column_partitions = coarsening_factor * column_partitions;

            rows_chunks = [0, row_partitions, rows_data];
            columns_chunks = [0, column_partitions, columns_data];

            % The partitions are saved
            save('Optimised_Partitions.mat', 'rows_chunks', 'columns_chunks');
        end

    %%% Plot %%%
        opt_partitions_alpha = partition_alpha_mask(rows_data, columns_data, row_partitions, column_partitions, line_width);
        save_partition_figure(2, CLC_data, opt_partitions_alpha, 'Optimised partitions', 'Data_Clusters_Optimised.png');

        disp('The optimised partitions have been determined');
    end

    %%% The optimisation function %%%
    % Nested objective for ga. It reads number_chunks_x, number_chunks_y, CLC_data_c,
    % rows_data_c and columns_data_c from the parent workspace.
    % optimisation_parameters holds the row partitions followed by the column
    % partitions, in coarsened pixels. The returned value is minimised by ga.
    function objective = Cluster_Optimiser(optimisation_parameters)
        % The following parameters are optimised for
        optimisation_parameters = round(optimisation_parameters);
        row_partitions_opt = optimisation_parameters(1 : number_chunks_y - 1);
        column_partitions_opt = optimisation_parameters(number_chunks_y : number_chunks_x + number_chunks_y - 2);

        rows_chunks_opt = [0, row_partitions_opt, rows_data_c];
        columns_chunks_opt = [0, column_partitions_opt, columns_data_c];

        % The area within each chunk is analysed
        chunk_area_matrix = zeros(number_chunks_y, number_chunks_x);
        fraction_missing_data_matrix = zeros(number_chunks_y, number_chunks_x);

        for c_x = 1 : number_chunks_x
            % Bounds of this chunk
            column_W = columns_chunks_opt(c_x) + 1;
            column_E = columns_chunks_opt(c_x + 1);

            columns_chunk = column_E - column_W + 1;

            for c_y = 1 : number_chunks_y
                row_N = rows_chunks_opt(c_y) + 1;
                row_S = rows_chunks_opt(c_y + 1);

                rows_chunk = row_S - row_N + 1;

                % This chunk's land cover data
                CLC_data_chunk = CLC_data_c(row_N : row_S, column_W : column_E);

                % Separate land masses (non-zero pixels, 8-connected) within the chunk
                land_masses = bwconncomp(CLC_data_chunk, 8);

                % A chunk consisting of solely water pixels is also good
                if land_masses.NumObjects == 0
                    chunk_area_matrix(c_y, c_x) = 1;
                    continue
                end

                % The area and bounding box of each of these land masses
                land_properties = regionprops(land_masses, 'Area', 'BoundingBox');
                land_area_list = [land_properties.Area];

                % The ratio between missing data (flag 2) and all land pixels
                missing_area = nnz(CLC_data_chunk == 2);
                fraction_missing_data_matrix(c_y, c_x) = missing_area / sum(land_area_list);

                % Only land masses at the edge of the chunk are of interest to be divided properly.
                % The bounding boxes start half a pixel before the first pixel centre.
                bounding_boxes = vertcat(land_properties.BoundingBox);

                bound_W = bounding_boxes(:, 1) + 0.5;
                bound_N = bounding_boxes(:, 2) + 0.5;
                bound_E = bound_W + bounding_boxes(:, 3) - 1;
                bound_S = bound_N + bounding_boxes(:, 4) - 1;

                touches_edge = bound_W == 1 | bound_N == 1 | bound_E == columns_chunk | bound_S == rows_chunk;
                edge_areas = land_area_list(touches_edge);

                % Minimum normalised surface area. If no land mass is cut by the chunk
                % edges the chunk is ideal, which avoids a NaN objective value.
                if isempty(edge_areas)
                    chunk_area_matrix(c_y, c_x) = 1;
                else
                    chunk_area_matrix(c_y, c_x) = min(edge_areas) / (rows_chunk * columns_chunk);
                end
            end
        end

        % The first objective is to maximise the minimum surface area, as the optimiser minimises the negative has to be taken
        objective_1 = -min(chunk_area_matrix(:));

        % The second objective is to minimise the ratio of missing data to valid data
        objective_2 = max(fraction_missing_data_matrix(:));

        % The objectives are equally weighted
        objective = objective_1 + objective_2;
    end
end

% Returns the name of the first file matching the pattern, or '' if there is none
function file_name = first_matching_file(pattern)
    file_list = dir(pattern);

    if isempty(file_list)
        file_name = '';
    else
        file_name = file_list(1).name;
    end
end

% Raises a descriptive error if no file name was found for the given pattern(s)
function require_file(file_name, pattern_description)
    if isempty(file_name)
        error('Smart_Clusterer:missingFile', 'No file matching %s was found in the current folder.', pattern_description);
    end
end

% Reads the first GeoTIFF matching the pattern, raising an error if there is none
function Data = read_first_geotiff(pattern)
    file_name = first_matching_file(pattern);
    require_file(file_name, pattern);

    Data = geotiffread(file_name);
end

% Splits the data into land (linear indices) and water (logical mask), where
% water pixels are those equal to Water_value within the given margin
function [land_indices, water_indices] = split_on_flag_value(Data, Water_value, margin)
    land_indices = find(Data < Water_value - margin | Data > Water_value + margin);
    water_indices = Data > Water_value - margin & Data < Water_value + margin;
end

% Creates the transparency mask: 1 everywhere, 0 along the partition lines.
% line_width is the half-width of a line, the lines are clipped to the raster.
function alpha_mask = partition_alpha_mask(rows_data, columns_data, row_partitions, column_partitions, line_width)
    alpha_mask = ones(rows_data, columns_data);

    for i = 1 : numel(column_partitions)
        first_column = max(1, column_partitions(i) - line_width);
        last_column = min(columns_data, column_partitions(i) + line_width);

        alpha_mask(:, first_column : last_column) = 0;
    end

    for j = 1 : numel(row_partitions)
        first_row = max(1, row_partitions(j) - line_width);
        last_row = min(rows_data, row_partitions(j) + line_width);

        alpha_mask(first_row : last_row, :) = 0;
    end
end

% Shows the flagged land cover data on a red background, which is visible along
% the partition lines, saves the figure under figure_name and closes it
function save_partition_figure(figure_number, CLC_data, partitions_alpha, figure_title, figure_name)
    [rows_data, columns_data] = size(CLC_data);

    % A red background is used to show the partitions
    red_background = cat(3, ones(rows_data, columns_data), zeros(rows_data, columns_data), zeros(rows_data, columns_data));

    figure(figure_number)
    % Set the size and white background color
    set(gcf, 'Units', 'Normalized', 'Position', [0 0 1 1]);
    set(gcf, 'color', [1, 1, 1]);

    hold on
    image(red_background);
    set(gca, 'YDir', 'reverse')
    axis off

    % The data is invisible at the partitioning lines
    image_handle = imshow(CLC_data);
    set(image_handle, 'AlphaData', partitions_alpha);
    caxis([0, 2])
    title(figure_title)
    hold off

    % Save the figure, falling back on built-in functions if export_fig is unavailable
    try
        export_fig(figure_name);
    catch
        frame = getframe(figure_number);
        im = frame2im(frame);
        [imind, cm] = rgb2ind(im, 256);
        imwrite(imind, cm, figure_name);
    end

    close(figure_number)
end