from __future__ import print_function, division
import re
import os
import datetime
import sys
import pandas as pd
import numpy as np
from collections import namedtuple
from nilmtk.dataset import DataSet
from nilmtk.utils import get_immediate_subdirectories
from nilmtk.building import Building
from nilmtk.sensors.electricity import MainsName, Measurement, ApplianceName, DualSupply
from nilmtk.sensors.electricity import get_dual_supply_columns
# Maps from a REDD appliance name to
# ApplianceMetadata('<nilmtk name>', <metadata dict>)
ApplianceMetadata = namedtuple('ApplianceMetadata', ['name', 'metadata'])
APPLIANCE_NAME_MAP = {
'oven': ApplianceMetadata('oven', {'fuel':'electricity', 'dualsupply': True}),
'refrigerator': ApplianceMetadata('fridge', {}),
    'dishwaser': ApplianceMetadata('dishwasher', {}),  # sic: REDD's own spelling
'kitchen_outlets': ApplianceMetadata('kitchen outlets', {}),
'washer_dryer': ApplianceMetadata('washer dryer', {'dualsupply': True}),
'bathroom_gfi': ApplianceMetadata('bathroom misc', {}),
'electric_heat': ApplianceMetadata('space heater', {'fuel':'electricity'}),
'stove': ApplianceMetadata('hob', {'fuel':'electricity'})
}
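# For example, the map renames and annotates entries like so:
#   APPLIANCE_NAME_MAP['refrigerator'].name     -> 'fridge'
#   APPLIANCE_NAME_MAP['washer_dryer'].metadata -> {'dualsupply': True}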
# TODO:
# Check that these dualsupply==True appliances really are dualsupply!
# maps from house number to a list of dud REDD channel numbers
DUD_CHANNELS = {1: [19]}


def load_chan(building_dir, chan=None, filename=None, colnames=None,
              usecols=None, sep=' '):
"""Loads CSV files where the first column is a UNIX timestamp,
like REDD or UKPD CSV files.
Parameters
----------
building_dir : string
The base path
chan : int, optional
filename will be formed from 'building_dir/channel_<chan>.dat'
    filename : string, optional
        If you want to load a file not named `channel_<chan>.dat`, leave
        `chan` as None and provide just the `filename` (no path).
colnames : list, optional
The names to give to each column
usecols : list of ints, optional
        A list of column indices to load. Note that the index column
        counts as column 0. If usecols is provided then load_chan
        will run `usecols.insert(0, 0)` to load the index column.
sep : character, optional
Defaults to ' '
Returns
-------
DataFrame. Index is DatetimeIndex in UTC. Data values are float32.
Column names are `colnames` if provided.
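
    Examples
    --------
    Usage sketches; the paths below are hypothetical:

    >>> df = load_chan('/data/redd/low_freq/house_1', chan=3)
    >>> df = load_chan('/data/redd/low_freq/house_1', filename='mains.dat',
    ...                colnames=[Measurement('power', 'apparent')])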
"""
if colnames is None:
colnames = [Measurement('power','active')]
    if filename is None:
        if chan is None:
            raise ValueError("Either `chan` or `filename` must be provided.")
        filename = os.path.join(building_dir, 'channel_{:d}.dat'.format(chan))
    else:
        filename = os.path.join(building_dir, filename)
if chan is None:
print('Attempting to load', filename, '...', end='')
else:
print(' {:d}'.format(chan), end='')
if usecols:
if 0 not in usecols:
usecols.insert(0,0)
if colnames and 'index' not in colnames:
colnames.insert(0, 'index')
print("Only using columns", usecols, '...', end='')
sys.stdout.flush()
# Don't use date_parser with pd.read_csv. Instead load it all
# and then convert to datetime. Thanks to Nipun for linking to
# this discussion where jreback gives this tip:
# https://github.com/pydata/pandas/issues/3757
try:
df = pd.read_csv(filename, sep=sep, header=None, index_col=0,
parse_dates=False, names=colnames, usecols=usecols,
dtype={colname:np.float32 for colname in colnames
if colname != 'index'},
tupleize_cols=True)
except Exception as e:
print('failed:', str(e))
raise
else:
        # Convert UNIX timestamps (seconds) to nanoseconds, then to a
        # DatetimeIndex in UTC.
        df.index = pd.to_datetime((df.index.values * 1E9).astype(int), utc=True)
return df


def load_labels(data_dir):
"""Loads data from labels.dat file.
Parameters
----------
data_dir : str
Returns
-------
labels : dict
mapping channel numbers (ints) to appliance names (str)
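
    Examples
    --------
    A labels.dat file has one space-separated entry per line,
    channel number first, e.g.:

        1 mains
        2 mains
        3 oven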
"""
filename = os.path.join(data_dir, 'labels.dat')
with open(filename) as labels_file:
lines = labels_file.readlines()
labels = {}
for line in lines:
line = line.split(' ')
        # TODO: add error handling if line[0] is not an int
labels[int(line[0])] = line[1].strip()
return labels


class REDD(DataSet):
"""Load data from REDD."""
def __init__(self):
super(REDD, self).__init__()
self.metadata = {
'name': 'REDD',
'full_name': 'Reference Energy Disaggregation Data Set',
'urls': ['http://redd.csail.mit.edu'],
'citations': ['J. Zico Kolter and Matthew J. Johnson.'
' REDD: A public data set for energy disaggregation'
' research. In proceedings of the SustKDD workshop on'
' Data Mining Applications in Sustainability, 2011.'],
            'geographic_coordinates': (42.360091, -71.09416),  # MIT's coordinates
'timezone': 'US/Eastern' # MIT is on the east coast
}

    def _pre_process_dataframe(self, df):
        """Sort the index and convert it to the dataset's local timezone."""
        df = df.sort_index()  # raw REDD data isn't always sorted
        df = df.tz_convert(self.metadata['timezone'])
        return df

    def load_building(self, root_directory, building_name):
# Construct new Building and set known attributes
building = Building()
building.metadata['original_name'] = building_name
# Load labels
        # Parse the house number from names like 'house_3' (also handles
        # two-digit numbers, unlike taking just the last character)
        building_number = int(building_name.split('_')[-1])
building_dir = os.path.join(root_directory, building_name)
labels = load_labels(building_dir)
print("Loading building {:d}:\n chans: ".format(building_number), end="")
sys.stdout.flush()
# Remove dud channels
try:
dud_channels_for_building = DUD_CHANNELS[building_number]
except KeyError:
# DUD_CHANNELS doesn't specify dud channels for all buildings
pass
else:
for dud_chan in dud_channels_for_building:
labels.pop(dud_chan)
# Convert appliance names from REDD to nilmtk standard names
appliance_metadata = {}
        for chan, label in labels.items():
nilmtk_appliance = APPLIANCE_NAME_MAP.get(label)
if nilmtk_appliance is not None:
labels[chan] = nilmtk_appliance.name
if nilmtk_appliance.metadata:
appliance_metadata[nilmtk_appliance.name] = nilmtk_appliance.metadata
# Split channels into mains and appliances
mains_chans = []
appliance_chans = []
        for chan, label in labels.items():
if label == 'mains':
mains_chans.append(chan)
else:
appliance_chans.append(chan)
# Load mains chans
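        # REDD houses record two mains channels (assumed here to be the two
        # legs of the North-American split-phase supply); each becomes one
        # MainsName split on meter 1.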
for mains_chan in mains_chans:
mainsname = MainsName(split=mains_chan, meter=1)
df = load_chan(building_dir, mains_chan, colnames=[Measurement('power', 'apparent')])
df = self._pre_process_dataframe(df)
building.utility.electric.mains[mainsname] = df
# Load sub metered channels
        instances = {}
        # instances maps:
        # {<appliance name>:
        #      (<index of next appliance instance>, <index of next supply>)}
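        # For example, a dualsupply washer dryer recorded on two channels
        # moves through these states:
        #   {}                         before its first channel
        #   {'washer dryer': (1, 2)}   after loading supply 1
        #   {'washer dryer': (2, 1)}   after loading supply 2 (instance done)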
measurement = Measurement('power', 'active')
for appliance_chan in appliance_chans:
# Get appliance label and instance
label = labels[appliance_chan]
instance, supply = instances.get(label, (1,1))
appliancename = ApplianceName(name=label, instance=instance)
metadata = appliance_metadata.get(label)
is_dualsupply = metadata and metadata.get('dualsupply')
if is_dualsupply:
colname = DualSupply(measurement, supply)
df = load_chan(building_dir, appliance_chan, colnames=[colname])
df = self._pre_process_dataframe(df)
df[colname].name = appliancename
if supply == 1:
building.utility.electric.appliances[appliancename] = df
instances[label] = (instance, supply + 1)
else:
building.utility.electric.appliances[appliancename] = \
building.utility.electric.appliances[appliancename].join(df)
instances[label] = (instance + 1, 1)
else:
# This is not a DualSupply appliance
instances[label] = (instance + 1, 1)
colname = measurement
df = load_chan(building_dir, appliance_chan, colnames=[colname])
df = self._pre_process_dataframe(df)
df[colname].name = appliancename
building.utility.electric.appliances[appliancename] = df
        # A DualSupply appliance should end up with two channels. If only
        # one was found, downgrade its DualSupply column to a plain
        # Measurement.
        appliances = building.utility.electric.appliances
        for appliance_name, appliance_df in appliances.items():
dual_supply_columns = get_dual_supply_columns(appliance_df)
n_dual_supply_columns = len(dual_supply_columns)
            if n_dual_supply_columns == 1:
                col = dual_supply_columns[0]
                # Drop the DualSupply wrapper, keeping just its Measurement
                appliances[appliance_name].rename(columns={col: col.measurement},
                                                  inplace=True)
# TODO
# Store appliance_metadata for each appliance instance in electric.metadata['appliances']
# Set up wiring
self.buildings[building_number] = building
print("")

    def load_building_names(self, root_directory):
dirs = get_immediate_subdirectories(root_directory)
        pattern = re.compile(r'house_[0-9]+$')
        dirs = [d for d in dirs if pattern.match(d)]
        # Sort numerically so that e.g. house_10 follows house_9
        dirs.sort(key=lambda d: int(d.split('_')[-1]))
return dirs
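

# Usage sketch; the data path below is hypothetical:
#
#     redd = REDD()
#     names = redd.load_building_names('/data/redd/low_freq')
#     for name in names:
#         redd.load_building('/data/redd/low_freq', name)
#     mains = redd.buildings[1].utility.electric.mains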