Source code for nilmtk.dataset.redd

from __future__ import print_function, division
import re
import os
import datetime
import sys
import pandas as pd
import numpy as np
from collections import namedtuple
from nilmtk.dataset import DataSet
from nilmtk.utils import get_immediate_subdirectories
from nilmtk.building import Building
from nilmtk.sensors.electricity import MainsName, Measurement, ApplianceName, DualSupply
from nilmtk.sensors.electricity import get_dual_supply_columns

# Lookup table keyed by the appliance label found in REDD's labels.dat.
# Each value is an ApplianceMetadata tuple:
#   name     : the nilmtk standard name for the appliance
#   metadata : dict of extra metadata for the appliance (may be empty)
ApplianceMetadata = namedtuple('ApplianceMetadata', ['name', 'metadata'])
APPLIANCE_NAME_MAP = {
    'oven': ApplianceMetadata(
        name='oven',
        metadata={'fuel': 'electricity', 'dualsupply': True}),
    'refrigerator': ApplianceMetadata(name='fridge', metadata={}),
    # NOTE(review): 'dishwaser' presumably mirrors the spelling used in the
    # REDD label files -- confirm before "correcting" this key.
    'dishwaser': ApplianceMetadata(name='dishwasher', metadata={}),
    'kitchen_outlets': ApplianceMetadata(name='kitchen outlets', metadata={}),
    'washer_dryer': ApplianceMetadata(
        name='washer dryer',
        metadata={'dualsupply': True}),
    'bathroom_gfi': ApplianceMetadata(name='bathroom misc', metadata={}),
    'electric_heat': ApplianceMetadata(
        name='space heater',
        metadata={'fuel': 'electricity'}),
    'stove': ApplianceMetadata(
        name='hob',
        metadata={'fuel': 'electricity'}),
}

# TODO:
# Verify that every appliance flagged with dualsupply == True above really
# is fed by two supplies.

# House number -> list of REDD channel numbers that are duds and must be
# dropped before loading.
DUD_CHANNELS = {1: [19]}

def load_chan(building_dir, chan=None, filename=None, colnames=None,
              usecols=None, sep=' '):
    """Loads CSV files where the first column is a UNIX timestamp, like
    REDD or UKPD CSV files.

    Parameters
    ----------
    building_dir : string
        The base path
    chan : int, optional
        filename will be formed from 'building_dir/channel_<chan>.dat'
    filename : string, optional
        If you want to load a filename not of the form
        `channel_<chan>.dat` then leave `chan` as None and provide just
        the `filename`, no path.
    colnames : list, optional
        The names to give to each column.  Defaults to a single
        ``Measurement('power', 'active')`` column.  The list passed in is
        copied, never modified.
    usecols : list of ints, optional
        A list of column indices to load.  Note that the index column
        counts as column 0.  If `usecols` is provided then column 0 is
        added to a private copy of it (and 'index' to a private copy of
        `colnames`) so that the index column is always loaded.  The list
        passed in is copied, never modified.
    sep : character, optional
        Defaults to ' '

    Returns
    -------
    DataFrame.  Index is DatetimeIndex in UTC.  Data values are float32.
    Column names are `colnames` if provided.

    Raises
    ------
    Exception
        Whatever `pd.read_csv` raises is re-raised after a 'failed:'
        message has been printed.
    """
    # Work on private copies: the index column bookkeeping below inserts
    # into these lists and must not leak into the caller's objects.
    if colnames is None:
        colnames = [Measurement('power', 'active')]
    else:
        colnames = list(colnames)

    if filename is None:
        filename = os.path.join(building_dir, 'channel_{:d}.dat'.format(chan))
    else:
        filename = os.path.join(building_dir, filename)

    if chan is None:
        print('Attempting to load', filename, '...', end='')
    else:
        print(' {:d}'.format(chan), end='')

    if usecols:
        usecols = list(usecols)
        if 0 not in usecols:
            usecols.insert(0, 0)
        if colnames and 'index' not in colnames:
            colnames.insert(0, 'index')
        print("Only using columns", usecols, '...', end='')

    sys.stdout.flush()

    # Don't use date_parser with pd.read_csv.  Instead load it all
    # and then convert to datetime.  Thanks to Nipun for linking to
    # this discussion where jreback gives this tip:
    # https://github.com/pydata/pandas/issues/3757
    #
    # NOTE(review): `tupleize_cols` is no longer accepted by recent pandas
    # releases -- confirm which pandas version this module targets before
    # touching the call.
    data_dtypes = {colname: np.float32
                   for colname in colnames if colname != 'index'}
    try:
        df = pd.read_csv(filename, sep=sep, header=None, index_col=0,
                         parse_dates=False, names=colnames, usecols=usecols,
                         dtype=data_dtypes, tupleize_cols=True)
    except Exception as e:
        print('failed:', str(e))
        raise

    # Seconds -> nanoseconds since the epoch.  An explicit 64-bit integer is
    # required: plain `int` is only 32 bits wide on some platforms, which
    # would overflow for nanosecond timestamps.
    nanoseconds = (df.index.values * 1E9).astype(np.int64)
    df.index = pd.to_datetime(nanoseconds, utc=True)
    return df
def load_labels(data_dir):
    """Loads data from labels.dat file.

    Each non-blank line of `<data_dir>/labels.dat` is expected to hold a
    channel number followed by an appliance name, separated by whitespace.
    Blank lines are ignored.  Only the first token after the channel number
    is used as the appliance name.

    Parameters
    ----------
    data_dir : str
        Directory containing `labels.dat`.

    Returns
    -------
    labels : dict
        mapping channel numbers (ints) to appliance names (str)

    Raises
    ------
    ValueError
        If a non-blank line has no appliance name or its channel number
        is not an integer.
    """
    filename = os.path.join(data_dir, 'labels.dat')
    labels = {}
    with open(filename) as labels_file:
        for line_number, line in enumerate(labels_file, start=1):
            # split() with no argument tolerates tabs, repeated spaces and
            # leading/trailing whitespace (including the newline).
            fields = line.split()
            if not fields:
                continue  # blank line, e.g. a trailing newline at EOF
            if len(fields) < 2:
                raise ValueError(
                    "{}:{:d}: expected '<channel> <label>', got {!r}"
                    .format(filename, line_number, line))
            try:
                chan = int(fields[0])
            except ValueError:
                raise ValueError(
                    "{}:{:d}: channel number {!r} is not an integer"
                    .format(filename, line_number, fields[0]))
            labels[chan] = fields[1]
    return labels
class REDD(DataSet):
    """Load data from REDD."""

    def __init__(self):
        super(REDD, self).__init__()
        self.metadata = {
            'name': 'REDD',
            'full_name': 'Reference Energy Disaggregation Data Set',
            'urls': ['http://redd.csail.mit.edu'],
            'citations': ['J. Zico Kolter and Matthew J. Johnson.'
                          ' REDD: A public data set for energy disaggregation'
                          ' research. In proceedings of the SustKDD workshop on'
                          ' Data Mining Applications in Sustainability, 2011.'],
            'geographic_coordinates': (42.360091, -71.09416),  # MIT's coordinates
            'timezone': 'US/Eastern'  # MIT is on the east coast
        }

    def _pre_process_dataframe(self, df):
        """Return `df` sorted by index and converted to the dataset's
        timezone (`self.metadata['timezone']`)."""
        df = df.sort_index()  # raw REDD data isn't always sorted
        df = df.tz_convert(self.metadata['timezone'])
        return df

    def load_building(self, root_directory, building_name):
        """Load one building and store it in `self.buildings`.

        Parameters
        ----------
        root_directory : str
            Directory containing the building directories.
        building_name : str
            Name of the building directory.  It must end in the building
            number (e.g. 'house_1'); that number is the key used in
            `self.buildings`.

        Raises
        ------
        ValueError
            If `building_name` does not end in a number.
        """
        # Construct new Building and set known attributes
        building = Building()
        building.metadata['original_name'] = building_name

        # Take all the trailing digits as the building number, so that
        # multi-digit names such as 'house_10' are handled correctly.
        number_match = re.search(r'([0-9]+)$', building_name)
        if number_match is None:
            raise ValueError(
                "building name {!r} does not end in a building number"
                .format(building_name))
        building_number = int(number_match.group(1))

        # Load labels
        building_dir = os.path.join(root_directory, building_name)
        labels = load_labels(building_dir)
        print("Loading building {:d}:\n chans: ".format(building_number), end="")
        sys.stdout.flush()

        # Remove dud channels.  DUD_CHANNELS doesn't specify dud channels for
        # all buildings, and a listed dud channel may be absent from
        # labels.dat, so neither case is treated as an error.
        for dud_chan in DUD_CHANNELS.get(building_number, []):
            labels.pop(dud_chan, None)

        # Channels are always visited in ascending channel-number order so
        # that instance numbering and the pairing of the two supplies of a
        # dual-supply appliance do not depend on dict iteration order.
        chans = sorted(labels)

        # Convert appliance names from REDD to nilmtk standard names
        appliance_metadata = {}
        for chan in chans:
            nilmtk_appliance = APPLIANCE_NAME_MAP.get(labels[chan])
            if nilmtk_appliance is not None:
                labels[chan] = nilmtk_appliance.name
                if nilmtk_appliance.metadata:
                    appliance_metadata[nilmtk_appliance.name] = \
                        nilmtk_appliance.metadata

        # Split channels into mains and appliances
        mains_chans = [chan for chan in chans if labels[chan] == 'mains']
        appliance_chans = [chan for chan in chans if labels[chan] != 'mains']

        # Load mains chans
        for mains_chan in mains_chans:
            mainsname = MainsName(split=mains_chan, meter=1)
            df = load_chan(building_dir, mains_chan,
                           colnames=[Measurement('power', 'apparent')])
            df = self._pre_process_dataframe(df)
            building.utility.electric.mains[mainsname] = df

        # Load sub metered channels
        appliances = building.utility.electric.appliances
        # instances is a dict which maps:
        # {<'appliance name'>:
        #  (<index of next appliance instance>, <i of next supply>)}
        instances = {}
        measurement = Measurement('power', 'active')
        for appliance_chan in appliance_chans:
            # Get appliance label and instance
            label = labels[appliance_chan]
            instance, supply = instances.get(label, (1, 1))
            appliancename = ApplianceName(name=label, instance=instance)
            metadata = appliance_metadata.get(label)
            is_dualsupply = metadata and metadata.get('dualsupply')

            if is_dualsupply:
                colname = DualSupply(measurement, supply)
            else:
                colname = measurement

            df = load_chan(building_dir, appliance_chan, colnames=[colname])
            df = self._pre_process_dataframe(df)
            # NOTE(review): kept from the original; this sets `.name` on the
            # Series returned by df[colname] -- confirm it has the intended
            # effect on `df`.
            df[colname].name = appliancename

            if not is_dualsupply:
                appliances[appliancename] = df
                instances[label] = (instance + 1, 1)
            elif supply == 1:
                # First supply: the next channel with this label is treated
                # as the second supply of the same instance.
                appliances[appliancename] = df
                instances[label] = (instance, supply + 1)
            else:
                # Second supply: join onto the first supply's DataFrame and
                # move on to the next instance.
                appliances[appliancename] = appliances[appliancename].join(df)
                instances[label] = (instance + 1, 1)

        # Now go through all DualSupply appliances to make sure there are
        # two chans.  An appliance that ended up with a single DualSupply
        # column has that column renamed to its plain measurement.
        for appliance_name in list(appliances):
            appliance_df = appliances[appliance_name]
            dual_supply_columns = get_dual_supply_columns(appliance_df)
            if len(dual_supply_columns) == 1:
                col = dual_supply_columns[0]
                appliance_df.rename(columns={col: col.measurement},
                                    inplace=True)

        # TODO
        # Store appliance_metadata for each appliance instance in
        # electric.metadata['appliances']
        # Set up wiring

        self.buildings[building_number] = building
        print("")

    def load_building_names(self, root_directory):
        """Return the names of the building directories in `root_directory`.

        Only immediate subdirectories named 'house_<number>' are returned,
        ordered by building number (so 'house_2' comes before 'house_10').
        """
        pattern = re.compile(r'house_([0-9]+)$')
        names = [name
                 for name in get_immediate_subdirectories(root_directory)
                 if pattern.match(name)]
        names.sort(key=lambda name: int(pattern.match(name).group(1)))
        return names