import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist, squareform

# [To do] Load the data
# NOTE: the path must start directly with the drive letter. A leading space
# in front of "C:" does not form a valid drive-qualified Windows path, so the
# workbook could not be opened.
bestandspad = r'C:\Users\Data_Technical.xlsx'

data = pd.read_excel(bestandspad)

# [To do] Select the variables (columns) used for the clustering
data_clustering = data[[
    'Var 1', 'Var 2', 'Var 3', 'Var 4',
    'Var 5', 'Var 6', 'Var 7', 'Var 8',
    'Var 9', 'Var 10', 'Var 11', 'Var 12',
]]

# Labels: the values of the 'Naam' column, passed to the dendrogram later on
labels = data['Naam'].values

# Jaccard distance: cast the selected columns to integers, because the
# distance function tests each value with `== 1`.
# NOTE(review): presumably every selected column holds 0/1 indicators;
# astype(int) truncates fractional values and fails on NaN -- TODO confirm.
data_clustering_binary = data_clustering.astype(int)

def jaccard_distance(u, v):
    """Return the Jaccard distance between two binary vectors.

    A position counts as "present" when its value equals 1. With A and B
    the sets of present positions in ``u`` and ``v``, the distance is
    ``1 - |A ∩ B| / |A ∪ B|``.

    Args:
        u: Array-like of 0/1 values (NumPy array, list, tuple, ...).
        v: Array-like of 0/1 values with the same length as ``u``.

    Returns:
        A float between 0 (identical sets) and 1 (disjoint sets). When
        neither vector contains a 1 the union is empty and the maximal
        distance 1.0 is returned.
    """
    # Coerce to arrays so plain lists/tuples are compared element-wise;
    # without this `[1, 0] == 1` is a single False and every pair would
    # silently get distance 1.
    u_present = np.asarray(u) == 1
    v_present = np.asarray(v) == 1

    union = np.count_nonzero(u_present | v_present)  # |A ∪ B|
    if union == 0:
        # Empty union: treated as maximal distance by convention here.
        return 1.0

    intersection = np.count_nonzero(u_present & v_present)  # |A ∩ B|
    return 1.0 - intersection / union

# Condensed vector of pairwise distances between all rows, computed with
# the custom Jaccard metric.
dist_matrix = pdist(data_clustering_binary, metric=jaccard_distance)

# The same distances as a square table, labelled on both axes with the
# row index of the selected data.
dist_matrix_df = pd.DataFrame(
    squareform(dist_matrix),
    index=data_clustering.index,
    columns=data_clustering.index,
)

# Hierarchical clustering using the 'complete' linkage method
Z = linkage(dist_matrix, method='complete')

# Dendrogram
fig, ax = plt.subplots(figsize=(10, 7))
dendrogram(Z, labels=labels, ax=ax)

# Slanted (45 degree), right-aligned tick labels on the x-axis
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_title('Hiërarchical Clustering (Jaccard)')
ax.set_xlabel('Index')
ax.set_ylabel('Distance')
plt.show()
