'''
lecture24_hierarchical_clustering_template.py
Dendrogram for Iris dataset and elbow method for detecting number of clusters
Oliver W. Layton
CS251: Data Analysis and Visualization
'''
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
import seaborn as sns


def plot_dendrogram(dists, p, title='Iris clusters'):
    '''Plots a dendrogram showing only the last `p` cluster merges.

    Args:
        dists: linkage matrix as returned by `scipy.cluster.hierarchy.linkage`
            (single-linkage in this script).
        p: int. Number of final merges to display; earlier merges are
            collapsed into the leaves.
        title: str. Base plot title. The value of `p` is appended to it.
    '''
    # Add p to title
    title = f'{title} (Top {p} merges)'

    plt.title(title)
    plt.xlabel('Data point')
    plt.ylabel('Distance')

    # Display dendrogram of the last p cluster merges
    dendrogram(dists,
               truncate_mode='lastp',
               p=p,
               leaf_rotation=90,
               leaf_font_size=14,
               show_contracted=True)
    plt.show()


def plot_elbow(merge_dist_list, title='Iris merge dists'):
    '''Makes an elbow plot to detect number of clusters

    Args:
        merge_dist_list: 1D list of merge distances (heights).
            Sorted high-to-low
        title: str. Plot title.
    '''
    # 1-based positions: position k holds the k-th largest merge distance,
    # i.e. the merge that leaves k clusters.
    x = np.arange(1, len(merge_dist_list) + 1)

    plt.plot(x, merge_dist_list)
    plt.title(title)
    plt.xlabel('Clusters remaining after merge')
    plt.ylabel('Merge distance')
    plt.show()


def main():
    '''Clusters Iris petal measurements and plots a dendrogram + elbow plot.'''
    # How many of the last cluster merges do we want to see in each plot?
    dendrogram_merges = 20
    elbow_merges = 10

    # Load in iris data.
    # NOTE: seaborn fetches the dataset over the network on first use and
    # caches it locally afterwards.
    iris = sns.load_dataset('iris')

    # Extract petal length/width features
    x = iris['petal_length'].to_numpy()
    y = iris['petal_width'].to_numpy()

    # Show data in a scatterplot
    plt.scatter(x, y)
    plt.xlabel('Petal length')
    plt.ylabel('Petal width')
    plt.show()

    # Stack x-y features as columns -> shape (num_samples, 2)
    features = np.column_stack((x, y))

    # Run single-linkage hierarchical clustering, get the cluster merge
    # minimum distances
    dists = linkage(features, method='single')

    # Suppress numeric precision beyond 2 decimal places. The context manager
    # keeps the setting from leaking into the rest of the program.
    with np.printoptions(precision=2, suppress=True):
        print(dists[:, 2])

    # Display dendrogram of the last p (20) cluster merges
    plot_dendrogram(dists, p=dendrogram_merges)

    # Take last p merges (10). Merge dists are in the 3rd col of dists
    last_merge_dists = dists[-elbow_merges:, 2]

    # We need to reverse the dist merge list from low-to-high to high-to-low
    last_merge_dists = last_merge_dists[::-1]

    # Display elbow plot of the last p (10) cluster merges
    plot_elbow(last_merge_dists)


if __name__ == '__main__':
    main()