Source code for nilmtk.feature_detectors.cluster

from __future__ import print_function, division
import numpy as np
import pandas as pd 


# Fix the seed for repeatability of experiments
SEED = 42
np.random.seed(SEED)


def cluster(X, max_num_clusters=3):
    '''Applies clustering on reduced data,
    i.e. data where power is greater than threshold.

    Parameters
    ----------
    X : pd.Series or single-column pd.DataFrame
    max_num_clusters : int

    Returns
    -------
    centroids : ndarray of int32s
        Power in different states of an appliance, sorted
    '''
    # Find where power consumption is greater than 10
    data = _transform_data(X)

    # Find clusters
    centroids = _apply_clustering(data, max_num_clusters)
    centroids = np.append(centroids, 0)  # add 'off' state
    centroids = np.round(centroids).astype(np.int32)
    centroids = np.unique(centroids)  # np.unique also sorts
    # TODO: Merge similar clusters
    return centroids
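
# --- Usage sketch (illustrative, not part of nilmtk) ------------------------
# A minimal, hypothetical example of calling cluster() on a synthetic power
# series for a two-state appliance. It is wrapped in a function so it does
# not run on import; exact centroids vary slightly with the random
# subsampling performed in _transform_data().
def _example_cluster_usage():
    power = pd.Series(np.concatenate([
        np.zeros(500),                    # appliance off
        np.random.normal(100, 5, 500),    # low-power state, roughly 100 W
        np.random.normal(2000, 20, 500),  # high-power state, roughly 2000 W
    ]))
    states = cluster(power, max_num_clusters=3)
    # Expect something close to array([0, 100, 2000], dtype=int32):
    # the two detected states plus the appended 'off' state.
    return states
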
def _transform_data(data):
    '''Subsamples if needed and converts to column vector
    (which is what scikit-learn requires).

    Parameters
    ----------
    data : pd.Series or single-column pd.DataFrame

    Returns
    -------
    data_above_thresh : ndarray
        column vector
    '''
    MAX_NUMBER_OF_SAMPLES = 2000
    MIN_NUMBER_OF_SAMPLES = 20
    DATA_THRESHOLD = 10

    data_above_thresh = data[data > DATA_THRESHOLD].dropna().values
    n_samples = len(data_above_thresh)

    if n_samples < MIN_NUMBER_OF_SAMPLES:
        return np.zeros((MAX_NUMBER_OF_SAMPLES, 1))
    elif n_samples > MAX_NUMBER_OF_SAMPLES:
        # Randomly subsample (we don't want to smoothly downsample
        # because that is likely to change the values)
        random_indices = np.random.randint(0, n_samples, MAX_NUMBER_OF_SAMPLES)
        resampled = data_above_thresh[random_indices]
        return resampled.reshape(MAX_NUMBER_OF_SAMPLES, 1)
    else:
        return data_above_thresh.reshape(n_samples, 1)


def _apply_clustering(X, max_num_clusters):
    '''
    Parameters
    ----------
    X : ndarray
    max_num_clusters : int

    Returns
    -------
    centroids : list of numbers
        List of power in different states of an appliance
    '''
    # If we import sklearn at the top of the file then it makes autodoc fail
    from sklearn.cluster import KMeans
    from sklearn import metrics

    # sklearn produces lots of DeprecationWarnings with PyTables
    import warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    # Find whether 2 or 3 clusters gives the better silhouette coefficient.
    # Whichever is higher serves as the number of clusters for that
    # appliance.
    num_clus = -1
    sh = -1
    k_means_labels = {}
    k_means_cluster_centers = {}
    k_means_labels_unique = {}
    for n_clusters in range(1, max_num_clusters):
        try:
            k_means = KMeans(init='k-means++', n_clusters=n_clusters)
            k_means.fit(X)
            k_means_labels[n_clusters] = k_means.labels_
            k_means_cluster_centers[n_clusters] = k_means.cluster_centers_
            k_means_labels_unique[n_clusters] = np.unique(k_means_labels[n_clusters])
            try:
                sh_n = metrics.silhouette_score(
                    X, k_means_labels[n_clusters], metric='euclidean')
                if sh_n > sh:
                    sh = sh_n
                    num_clus = n_clusters
            except Exception:
                # silhouette_score is undefined for a single cluster
                num_clus = n_clusters
        except Exception:
            if num_clus > -1:
                return k_means_cluster_centers[num_clus]
            else:
                return np.array([0])

    return k_means_cluster_centers[num_clus].flatten()
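
# --- Illustration (not part of nilmtk) ---------------------------------------
# A hypothetical sketch of what the two helpers above do: _transform_data()
# drops readings at or below 10 W and returns a column vector (subsampled to
# at most 2000 rows), and _apply_clustering() keeps the cluster count with
# the best silhouette score, trying 1 to max_num_clusters - 1 clusters.
def _example_helper_usage():
    readings = pd.Series([0, 0, 5, 150, 152, 148, 2010, 1995, 2005] * 10)
    X = _transform_data(readings)  # column vector of readings above 10 W
    assert X.shape[1] == 1
    centroids = _apply_clustering(X, max_num_clusters=3)
    # centroids is a flat array, here roughly [150, 2000] (order not guaranteed)
    return centroids
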
def hart85_means_shift_cluster(pair_buffer_df, cols):
    '''Clusters paired rising/falling transitions using mean shift.

    Parameters
    ----------
    pair_buffer_df : pd.DataFrame
        Matched transition pairs with 'T1 Active' / 'T2 Active' (and,
        optionally, 'T1 Reactive' / 'T2 Reactive') columns.
    cols : list of tuples
        Measurement columns, e.g. [('power', 'active')]

    Returns
    -------
    pd.DataFrame
        Cluster centres, one column per measurement in `cols`.
    '''
    from sklearn.cluster import MeanShift

    # Creating feature vector
    cluster_df = pd.DataFrame()
    power_types = [col[1] for col in cols]
    if 'active' in power_types:
        # Average of the absolute rising and falling active-power steps
        cluster_df['active'] = pd.Series(
            pair_buffer_df.apply(
                lambda row: (np.fabs(row['T1 Active']) +
                             np.fabs(row['T2 Active'])) / 2,
                axis=1),
            index=pair_buffer_df.index)
    if 'reactive' in power_types:
        cluster_df['reactive'] = pd.Series(
            pair_buffer_df.apply(
                lambda row: (np.fabs(row['T1 Reactive']) +
                             np.fabs(row['T2 Reactive'])) / 2,
                axis=1),
            index=pair_buffer_df.index)

    X = cluster_df.values.reshape((len(cluster_df.index), len(cols)))
    ms = MeanShift(bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)

    return pd.DataFrame(cluster_centers, columns=cols)
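
# --- Usage sketch (illustrative, not part of nilmtk) -------------------------
# A hypothetical example of feeding hart85_means_shift_cluster() a hand-built
# pair buffer. The 'T1 Active' / 'T2 Active' column names follow the lambdas
# above; a real pair buffer would come from nilmtk's Hart85 disaggregator.
def _example_hart85_clustering():
    pair_buffer_df = pd.DataFrame({
        'T1 Active': [100.0, 102.0, -1500.0, 98.0, 1505.0],
        'T2 Active': [-99.0, -101.0, 1498.0, -97.0, -1502.0],
    })
    cols = [('power', 'active')]
    centres = hart85_means_shift_cluster(pair_buffer_df, cols)
    # One row per mean-shift cluster, here roughly 100 W and 1500 W
    return centres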