# Source code for nilmtk.dataset.ukpd

from __future__ import print_function, division
import re
import os
import datetime
import pandas as pd
import numpy as np
from collections import namedtuple
from nilmtk.dataset import DataSet
from nilmtk.utils import get_immediate_subdirectories
from nilmtk.building import Building
from nilmtk.sensors.electricity import MainsName, Measurement, ApplianceName, DualSupply
from nilmtk.sensors.electricity import get_dual_supply_columns
from nilmtk.dataset.redd import load_chan, load_labels, ApplianceMetadata

"""
MANUAL:

UKPD is a large dataset so this class provides some options to allow
you to load only a subset of the data.

For example, to only load house 1, and for the 1 sec mains to:
* only load active power
* downsample the 1sec mains

ukpd.load('/data/mine/vadeec/merged', 
           buildings_to_load=['house_1'], 
           one_sec_mains_params_to_load=['active'], 
           periods_to_load={1: ('2013-05-01', '2013-06-01')},
           downsample_one_sec_mains_rule='6S')
"""

"""
TODO:
* re-use more code from REDD
* put lighting_circuit into circuits
* set up wiring
* use correct measurements (some are apparent; some are active)
* convert to UKPD standard appliance names
* import metadata from house 1
* add citations to metadata
"""

# APPLIANCE_NAME_MAP is keyed by the appliance label used in the UKPD files.
# Each value is a tuple of the form:
#   ('<nilmtk name>', <metadata dict>)
# load_building() looks labels up here and reads `.name` / `.metadata`
# from the entry it finds; labels without an entry are kept unchanged.
#
# TODO: fill in this map.  Example entry:
#   'oven': ApplianceMetadata('oven', {'fuel':'electricity', 'dualsupply': True}),
APPLIANCE_NAME_MAP = {}

# House number -> list of channel numbers known to be duds.  Houses
# without an entry have no channels removed.
DUD_CHANNELS = {}

# load metadata

# House number -> (<start date>, <end date>) used to crop every channel
# of that house when the caller does not pass `periods_to_load`.
# None means "no limit" on that side.
DEFAULT_PERIODS_TO_LOAD = {
    1: ("2013-04-16", None),
}

# A loaded DataFrame is only stored if it has more rows than this.
MIN_SAMPLES_TO_LOAD = 100

def _load_sometimes_unplugged(data_dir):
    """Read ``data_dir/sometimes_unplugged.dat`` into a list of strings.

    Parameters
    ----------
    data_dir : string
        Directory expected to contain ``sometimes_unplugged.dat``.

    Returns
    -------
    list of strings
        One entry per non-blank line of the file, with surrounding
        whitespace stripped.  An empty list is returned if the file
        cannot be opened (any IOError from ``open``, which includes the
        file not existing).
    """
    su_filename = os.path.join(data_dir, 'sometimes_unplugged.dat')
    try:
        su_file = open(su_filename)
    except IOError:
        return []

    # Only the open() call is guarded: read errors still propagate, but
    # the handle is now always closed once we are done with it.
    with su_file:
        stripped_lines = (line.strip() for line in su_file)
        return [line for line in stripped_lines if line]


class UKPD(DataSet):

    """Load data from UKPD (the UK Power Dataset).

    Each building lives in its own sub-directory of the dataset root,
    named ``house_<number>``.  `load_building` reads one such directory
    and stores the resulting `Building` in ``self.buildings`` keyed by
    the building number.
    """

    def __init__(self):
        super(UKPD, self).__init__()
        self.metadata = {
            'name': 'UKPD',
            'full_name': 'UK Power Dataset',
            'urls': ['http://www.doc.ic.ac.uk/~dk3810/data/'],
            # Imperial's coordinates
            'geographic_coordinates': (51.464462, -0.076544),
            'timezone': 'Europe/London'
            # TODO: citations
        }

    @staticmethod
    def _parse_building_number(building_name):
        """Return the integer formed by the trailing digits of
        `building_name`, e.g. 'house_1' -> 1 and 'house_12' -> 12.

        Raises
        ------
        ValueError
            If `building_name` does not end in at least one digit.
        """
        trailing_digits = re.search(r'([0-9]+)$', building_name)
        if trailing_digits is None:
            raise ValueError("Cannot extract building number from {!r}"
                             .format(building_name))
        return int(trailing_digits.group(1))

    def load_building(self, root_directory, building_name,
                      periods_to_load=None,
                      one_sec_mains_params_to_load=None,
                      downsample_one_sec_mains_rule=None):
        """Load one building and store it in ``self.buildings``.

        Parameters
        ----------
        root_directory : string
            Directory containing the per-building sub-directories.
        building_name : string
            Name of the building's sub-directory, e.g. 'house_1'.  The
            trailing digits are used as the building number.
        periods_to_load : dict of tuples, optional
           Key of dict is the building number (int).
           Values are (<start date>, <end date>)
           e.g. ("2013-04-01", None) or ("2013-04-01", "2013-08-01")
           Defaults to DEFAULT_PERIODS_TO_LOAD, i.e.
           {1: ("2013-04-16", None)}.  Buildings with no entry are not
           cropped.
        one_sec_mains_params_to_load : list of strings, optional
            some combination of {'active', 'apparent', 'voltage'}
            Defaults to ['active', 'voltage']
        downsample_one_sec_mains_rule : string, optional
            How to downsample the 1-second mains data, if available.
            e.g. '6S'
            if None then no downsampling will be done on the 1-sec mains data.

        Raises
        ------
        ValueError
            If `building_name` does not end in a number.
        """

        if one_sec_mains_params_to_load is None:
            one_sec_mains_params_to_load = ['active', 'voltage']

        # Construct new Building and set known attributes
        building = Building()
        building.metadata['original_name'] = building_name
        electric = building.utility.electric

        # Load labels.  The building number is taken from all trailing
        # digits of the name so that e.g. 'house_12' is not confused
        # with 'house_2'.
        building_number = self._parse_building_number(building_name)
        building_dir = os.path.join(root_directory, building_name)
        labels = load_labels(building_dir)

        print("Loading building {:d}, orig name={}, path={}"
              .format(building_number, building_name, building_dir))

        # Process periods to load
        if periods_to_load is None:
            periods_to_load = DEFAULT_PERIODS_TO_LOAD

        start, end = periods_to_load.get(building_number, (None, None))
        if start or end:
            print("Will crop all channels for this building to start={}, end={}"
                  .format(start, end))

        # Remove dud channels.  DUD_CHANNELS doesn't specify dud channels
        # for all buildings, and a listed channel may be absent from
        # labels, so neither case is treated as an error.
        for dud_chan in DUD_CHANNELS.get(building_number, []):
            labels.pop(dud_chan, None)

        # Convert appliance names from UKPD to nilmtk standard names.
        # Iterate over a snapshot because labels is updated in the loop.
        # NOTE(review): appliance_metadata is collected here but nothing
        # below reads it yet -- confirm where it should be stored.
        appliance_metadata = {}
        for chan, label in list(labels.items()):
            nilmtk_appliance = APPLIANCE_NAME_MAP.get(label)
            if nilmtk_appliance is not None:
                labels[chan] = nilmtk_appliance.name
                if nilmtk_appliance.metadata:
                    appliance_metadata[
                        nilmtk_appliance.name] = nilmtk_appliance.metadata

        def _pre_process_dataframe(df):
            # Convert to the dataset's timezone, then crop to the
            # requested period.
            df = df.tz_convert(self.metadata['timezone'])
            return df[start:end]

        mains_name = MainsName(split=1, meter=1)

        # Load 1-second mains, if available
        usecols = []
        # columns in mains.dat are: index, active, apparent, voltage
        # usecols counts the index column as col 0
        if 'active' in one_sec_mains_params_to_load:
            usecols.append(1)
        if 'apparent' in one_sec_mains_params_to_load:
            usecols.append(2)
        if 'voltage' in one_sec_mains_params_to_load:
            usecols.append(3)
        try:
            # NOTE(review): colnames always lists all three measurements
            # even when usecols selects a subset -- confirm that
            # load_chan pairs names with the selected columns correctly.
            df = load_chan(building_dir, filename='mains.dat', usecols=usecols,
                           colnames=[Measurement('power', 'active'),
                                     Measurement('power', 'apparent'),
                                     Measurement('voltage', '')])
        except IOError:
            # some houses don't have 1-second mains
            pass
        else:
            df = _pre_process_dataframe(df)
            if downsample_one_sec_mains_rule:
                # NOTE(review): `how=` is the legacy pandas resample
                # API; newer pandas expects .resample(rule).mean().
                # Kept as-is for the pandas version this module targets.
                df = df.resample(rule=downsample_one_sec_mains_rule, how='mean')
            if len(df) > MIN_SAMPLES_TO_LOAD:
                electric.mains[mains_name] = df

        # Split channels into mains and appliances
        mains_chan = None
        appliance_chans = []
        for chan, label in labels.items():
            if label == 'aggregate':
                mains_chan = chan
            else:
                appliance_chans.append(chan)

        # Load Current Cost mains chans (only if we haven't loaded 1sec mains)
        # `is not None` so that a channel numbered 0 is not mistaken for
        # "no mains channel".
        if mains_chan is not None and electric.mains.get(mains_name) is None:
            df = load_chan(building_dir, mains_chan,
                           colnames=[Measurement('power', 'apparent')])
            df = _pre_process_dataframe(df)
            electric.mains[mains_name] = df

        # Load sub metered channels
        instances = {}
        # instances is a dict which maps:
        # {<'appliance name'>: <index of next appliance instance>}
        measurement = Measurement('power', 'active')
        for appliance_chan in appliance_chans:
            # Get appliance label and instance
            label = labels[appliance_chan]
            instance = instances.get(label, 1)
            appliancename = ApplianceName(name=label, instance=instance)
            instances[label] = instance + 1
            df = load_chan(building_dir, appliance_chan, colnames=[measurement])
            df = _pre_process_dataframe(df)
            df[measurement].name = appliancename
            if len(df) > MIN_SAMPLES_TO_LOAD:
                electric.appliances[appliancename] = df

        self.buildings[building_number] = building

    def load_building_names(self, root_directory):
        """Return the sorted names of the building sub-directories.

        Only immediate sub-directories of `root_directory` whose whole
        name is ``house_<digits>`` are returned.  Names such as 'house_'
        or 'house_1_old' are skipped because `load_building` cannot
        derive a correct building number from them.  Sorting is plain
        string order.
        """
        pattern = re.compile(r'house_[0-9]+$')
        subdirs = get_immediate_subdirectories(root_directory)
        return sorted(name for name in subdirs if pattern.match(name))