import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist, squareform

# [To do] Load the data
# The path must not start with whitespace: ' C:\...' is not a drive-absolute
# Windows path, so read_excel would fail before reading anything.
bestandspad = r'C:\Users\Data_Data.xlsx'

data = pd.read_excel(bestandspad)

# [To do] Select the variables (columns) that take part in the clustering
data_clustering = data[['Var 1', 'var 2', 'var 3', 'var 4', 'var 5',
                        'var 6', 'var 7', 'var 8', 'var 9', 'var 10', 'var 11']]

# Labels for the dendrogram, taken from the 'naam' column
labels = data['naam'].values

# Jaccard clustering: cast the selected columns to integers so the distance
# function can compare entries against 1.
# NOTE(review): presumably these columns hold 0/1 indicators; astype(int)
# raises on missing values and truncates non-integer floats -- confirm.
data_clustering_binary = data_clustering.astype(int)

def jaccard_distance(u, v):
    """Return the Jaccard distance between two indicator vectors.

    Only entries equal to 1 count as "present"; every other value is treated
    as absent. The distance is 1 - |A ∩ B| / |A ∪ B|, where A and B are the
    sets of positions at which ``u`` and ``v`` equal 1.

    Args:
        u: First vector (array-like supporting element-wise ``== 1``).
        v: Second vector, same length as ``u``.

    Returns:
        The Jaccard distance. When neither vector contains a 1 (empty
        union) the maximum distance 1 is returned.
    """
    in_u = u == 1
    in_v = v == 1

    shared = np.sum(in_u & in_v)    # |A ∩ B|
    combined = np.sum(in_u | in_v)  # |A ∪ B|

    # An empty union would divide by zero; treat it as maximally distant.
    if combined == 0:
        return 1
    return 1 - (shared / combined)

# Distance matrix: condensed vector of pairwise Jaccard distances between rows
dist_matrix = pdist(data_clustering_binary, metric=jaccard_distance)

# Expand the condensed vector to a square table; rows and columns carry the
# index of the selected data.
dist_matrix_df = pd.DataFrame(
    data=squareform(dist_matrix),
    index=data_clustering.index,
    columns=data_clustering.index,
)

print(dist_matrix_df)

# Heatmap of the pairwise distances, each cell annotated with two decimals
sns.heatmap(dist_matrix_df, fmt='.2f', cmap='coolwarm', annot=True)
plt.title('Distancematrix (Jaccard)')
plt.show()

# Hierarchical clustering with the complete-linkage method, computed on the
# pairwise distances held in dist_matrix
Z = linkage(dist_matrix, 'complete')

# Dendrogram of the resulting cluster tree, leaves named by `labels`
plt.figure(figsize=(10, 7))
dendrogram(Z, labels=labels)

plt.title('Hiërarchical Clustering (Jaccard)')
plt.xlabel('Index')
plt.ylabel('Distance')

# Slant the leaf labels (45 degrees)
plt.xticks(rotation=45, ha="right")
plt.show()
