'''lecture24_hierarchical_clustering.py
Dendrogram for Iris dataset and elbow method for detecting number of clusters
Oliver W. Layton
CS251: Data Analysis and Visualization
'''
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
import seaborn as sns


def plot_dendrogram(dists, p, title='Iris clusters'):
    '''Plots a dendrogram showing only the last `p` cluster merges.

    Args:
        dists: linkage matrix as returned by `scipy.cluster.hierarchy.linkage`
            (main() builds it with single-linkage).
        p: int. Number of final merges to display.
        title: str. Plot title; the merge count is appended to it.
    '''
    # Add p to title
    plt.title(f'{title} (Top {p} merges)')
    plt.xlabel('Data point')
    plt.ylabel('Distance')

    # truncate_mode='lastp' collapses everything below the last p merges
    dendrogram(dists,
               truncate_mode='lastp',
               p=p,
               leaf_rotation=90,
               leaf_font_size=14,
               show_contracted=True)
    plt.show()


def plot_elbow(merge_dist_list, title='Iris merge dists'):
    '''Makes an elbow plot to detect number of clusters.

    Args:
        merge_dist_list: 1D list of merge distances (heights).
            Sorted high-to-low.
        title: str. Plot title.
    '''
    # x is the 1-based rank of each merge distance: 1 is the largest
    # (final) merge because the list is sorted high-to-low.
    x = np.arange(1, len(merge_dist_list) + 1)
    plt.plot(x, merge_dist_list)
    plt.title(title)
    plt.xlabel('Merge rank (1 = final merge)')
    plt.ylabel('Merge distance')
    plt.show()


def main():
    '''Clusters the Iris petal features and shows the diagnostic plots.

    Shows, in order: a scatterplot of petal length vs. width, a dendrogram
    of the last 20 merges, and an elbow plot of the last 10 merge distances.
    '''
    # Load in iris data
    data = sns.load_dataset('iris')

    # Extract petal length/width features as a plain (n, 2) ndarray.
    # (np.matrix is no longer recommended by NumPy, so it is avoided here.)
    xy = data[['petal_length', 'petal_width']].to_numpy()

    # Show data in a scatterplot
    plt.scatter(xy[:, 0], xy[:, 1])
    plt.show()

    dists = linkage(xy, 'single')

    # Suppress numeric precision beyond 2 decimal places.
    np.set_printoptions(precision=2, suppress=True)
    # Merge dists are in the 3rd col of dists
    print(dists[:, 2])

    # How many of the last cluster merges do we want to see in the dendrogram?
    dendrogram_merges = 20
    plot_dendrogram(dists, dendrogram_merges)

    # Take the last 10 merge distances for the elbow plot. They come out of
    # the linkage matrix low-to-high, so reverse them to high-to-low.
    elbow_merges = 10
    top_merges = dists[-elbow_merges:, 2][::-1]
    plot_elbow(top_merges)


if __name__ == '__main__':
    main()