from __future__ import print_function, division
import numpy as np
import pandas as pd
# Fix the seed for repeatability of experiments
SEED = 42
np.random.seed(SEED)


def cluster(X, max_num_clusters=3):
    '''Applies clustering on reduced data,
    i.e. data where power is greater than threshold.

    Parameters
    ----------
    X : pd.Series or single-column pd.DataFrame
    max_num_clusters : int

    Returns
    -------
    centroids : ndarray of int32s
        Power in different states of an appliance, sorted
    '''
    # Find where power consumption is greater than the threshold
    # (10 W; see DATA_THRESHOLD in _transform_data)
    data = _transform_data(X)

    # Find clusters
    centroids = _apply_clustering(data, max_num_clusters)
    centroids = np.append(centroids, 0)  # add 'off' state
    centroids = np.round(centroids).astype(np.int32)
    centroids = np.unique(centroids)  # np.unique also sorts
    # TODO: Merge similar clusters
    return centroids
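

# A minimal usage sketch (illustrative, not part of the original module):
# cluster a synthetic power series for an appliance with two 'on' levels.
# Exact centroids depend on KMeans initialisation, so the output shown is
# only indicative.
#
#     >>> on_levels = np.concatenate([np.full(50, 100.0), np.full(50, 2500.0)])
#     >>> power = pd.Series(np.concatenate([np.zeros(100), on_levels]))
#     >>> cluster(power, max_num_clusters=3)  # doctest: +SKIP
#     array([   0,  100, 2500], dtype=int32)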


def _transform_data(data):
    '''Subsamples if needed and converts to a column vector (which is what
    scikit-learn requires).

    Parameters
    ----------
    data : pd.Series or single-column pd.DataFrame

    Returns
    -------
    data_above_thresh : ndarray
        Column vector
    '''
    MAX_NUMBER_OF_SAMPLES = 2000
    MIN_NUMBER_OF_SAMPLES = 20
    DATA_THRESHOLD = 10

    data_above_thresh = data[data > DATA_THRESHOLD].dropna().values
    n_samples = len(data_above_thresh)

    if n_samples < MIN_NUMBER_OF_SAMPLES:
        return np.zeros((MAX_NUMBER_OF_SAMPLES, 1))
    elif n_samples > MAX_NUMBER_OF_SAMPLES:
        # Randomly subsample with replacement (we don't want to smoothly
        # downsample because that is likely to change the values)
        random_indices = np.random.randint(0, n_samples, MAX_NUMBER_OF_SAMPLES)
        resampled = data_above_thresh[random_indices]
        return resampled.reshape(MAX_NUMBER_OF_SAMPLES, 1)
    else:
        return data_above_thresh.reshape(n_samples, 1)
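

# A quick sanity check (illustrative, not part of the original module): with
# fewer than MIN_NUMBER_OF_SAMPLES readings above the 10 W threshold, the
# helper returns the all-zeros placeholder column vector.
#
#     >>> _transform_data(pd.Series([0.0, 5.0, 50.0, 60.0])).shape
#     (2000, 1)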


def _apply_clustering(X, max_num_clusters):
    '''
    Parameters
    ----------
    X : ndarray
        Column vector of power readings
    max_num_clusters : int

    Returns
    -------
    centroids : ndarray
        Power in different states of an appliance
    '''
    # If we import sklearn at the top of the file then it makes autodoc fail
    from sklearn.cluster import KMeans
    from sklearn import metrics

    # sklearn produces lots of DeprecationWarnings with PyTables
    import warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    # Try each candidate number of clusters in [1, max_num_clusters) and
    # keep whichever gives the highest Silhouette coefficient.  That serves
    # as the number of clusters for this appliance.
    num_clus = -1
    sh = -1
    k_means_labels = {}
    k_means_cluster_centers = {}
    for n_clusters in range(1, max_num_clusters):
        try:
            k_means = KMeans(init='k-means++', n_clusters=n_clusters)
            k_means.fit(X)
            k_means_labels[n_clusters] = k_means.labels_
            k_means_cluster_centers[n_clusters] = k_means.cluster_centers_
            try:
                sh_n = metrics.silhouette_score(
                    X, k_means_labels[n_clusters], metric='euclidean')
                if sh_n > sh:
                    sh = sh_n
                    num_clus = n_clusters
            except Exception:
                # silhouette_score needs at least two distinct labels;
                # fall back to the current candidate
                num_clus = n_clusters
        except Exception:
            if num_clus > -1:
                return k_means_cluster_centers[num_clus].flatten()
            else:
                return np.array([0])

    if num_clus == -1:
        # The loop never ran (max_num_clusters <= 1)
        return np.array([0])
    return k_means_cluster_centers[num_clus].flatten()
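

# An isolated sketch of the silhouette-based selection above (illustrative,
# not part of the original module): with two well-separated groups, two
# clusters should win and the centroids should land near 1 and 10 (the
# order is not guaranteed).
#
#     >>> X = np.array([[1.0]] * 30 + [[10.0]] * 30)
#     >>> _apply_clustering(X, max_num_clusters=3)  # doctest: +SKIP
#     array([ 1., 10.])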


def hart85_means_shift_cluster(pair_buffer_df, cols):
    '''Clusters paired power transitions using mean shift.

    Parameters
    ----------
    pair_buffer_df : pd.DataFrame
        Matched transition pairs with magnitudes in 'T1 Active' /
        'T2 Active' (and 'T1 Reactive' / 'T2 Reactive') columns.
    cols : list of (physical_quantity, ac_type) tuples

    Returns
    -------
    pd.DataFrame
        One row per cluster centre, with `cols` as the columns
    '''
    from sklearn.cluster import MeanShift

    # Feature vector: mean absolute magnitude of each transition pair
    cluster_df = pd.DataFrame()
    power_types = [col[1] for col in cols]
    if 'active' in power_types:
        cluster_df['active'] = (pair_buffer_df['T1 Active'].abs() +
                                pair_buffer_df['T2 Active'].abs()) / 2
    if 'reactive' in power_types:
        cluster_df['reactive'] = (pair_buffer_df['T1 Reactive'].abs() +
                                  pair_buffer_df['T2 Reactive'].abs()) / 2

    X = cluster_df.values.reshape((len(cluster_df.index), len(cols)))
    ms = MeanShift(bin_seeding=True)
    ms.fit(X)
    return pd.DataFrame(ms.cluster_centers_, columns=cols)
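

# A minimal usage sketch. The 'T1 Active' / 'T2 Active' column names come
# from the code above; the ('power', 'active') tuple for `cols` is an
# assumed convention, not a confirmed fixture. Three transition pairs, two
# near 100 W and one near 2500 W, should yield two cluster centres.
#
#     >>> pairs = pd.DataFrame({'T1 Active': [100.0, 98.0, 2500.0],
#     ...                       'T2 Active': [-102.0, -97.0, -2490.0]})
#     >>> hart85_means_shift_cluster(pairs, [('power', 'active')])  # doctest: +SKIP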