Source code for nilmtk.dataset.ukpd

from __future__ import print_function, division
import re
import os
import datetime
import pandas as pd
import numpy as np
from collections import namedtuple
from nilmtk.dataset import DataSet
from nilmtk.utils import get_immediate_subdirectories
from nilmtk.building import Building
from nilmtk.sensors.electricity import MainsName, Measurement, ApplianceName, DualSupply
from nilmtk.sensors.electricity import get_dual_supply_columns
from nilmtk.dataset.redd import load_chan, load_labels, ApplianceMetadata

"""
MANUAL:

UKPD is a large dataset so this class provides some options to allow
you to load only a subset of the data.

For example, to load only house 1 and, for the 1-second mains data, to:
* load only active power
* downsample the data using a resample rule (here '6S')

ukpd.load('/data/mine/vadeec/merged', 
           buildings_to_load=['house_1'], 
           one_sec_mains_params_to_load=['active'], 
           periods_to_load={1: ('2013-05-01', '2013-06-01')},
           downsample_one_sec_mains_rule='6S')
"""

"""
TODO:
* re-use more code from REDD
* put lighting_circuit into circuits
* set up wiring
* use correct measurements (some are apparent; some are active)
* convert to UKPD standard appliance names
* import metadata from house 1
* add citations to metadata
"""

# Maps from the UKPD appliance label to an ApplianceMetadata tuple:
#   ('<nilmtk name>', <metadata dict>)
# UKPD.load_building looks labels up with .get(), so any label missing
# from this map is kept unchanged.

# TODO: fill in this map
APPLIANCE_NAME_MAP = {
    #    'oven': ApplianceMetadata('oven', {'fuel':'electricity', 'dualsupply': True}),
}

# Maps from house number (int) to a list of dud channel numbers.
# UKPD.load_building removes these channels from the labels before any
# channel data is loaded.  Houses without an entry have no dud channels.
DUD_CHANNELS = {}

# load metadata

# Start and end times per building: {<house number>: (<start>, <end>)}.
# Used by UKPD.load_building when the caller passes no periods_to_load;
# the values are used to slice each channel's time index, so None leaves
# that end of the range open.
DEFAULT_PERIODS_TO_LOAD = {1: ("2013-04-16", None)}

# A channel is only stored on the building if it has MORE than this many
# samples after cropping (and, for 1-second mains, after downsampling).
MIN_SAMPLES_TO_LOAD = 100

def _load_sometimes_unplugged(data_dir):
    """Load ``data_dir/sometimes_unplugged.dat`` and return its entries.

    Parameters
    ----------
    data_dir : str
        Directory expected to contain ``sometimes_unplugged.dat``.

    Returns
    -------
    list of str
        Every non-blank line of the file with surrounding whitespace
        stripped, in file order.  An empty list if the file cannot be
        opened (e.g. it does not exist).
    """
    su_filename = os.path.join(data_dir, 'sometimes_unplugged.dat')
    # Only a failure to *open* the file is treated as "no entries"; errors
    # raised while reading still propagate to the caller.
    try:
        su_file = open(su_filename)
    except IOError:
        return []

    # The context manager guarantees the handle is closed, even if reading
    # raises part-way through.
    with su_file:
        stripped_lines = [line.strip() for line in su_file]
    return [line for line in stripped_lines if line]


class UKPD(DataSet):
    """Load data from UKPD."""

    def __init__(self):
        super(UKPD, self).__init__()
        self.metadata = {
            'name': 'UKPD',
            'full_name': 'UK Power Dataset',
            'urls': ['http://www.doc.ic.ac.uk/~dk3810/data/'],
            # Imperial's coordinates
            'geographic_coordinates': (51.464462, -0.076544),
            'timezone': 'Europe/London'
            # TODO: citations
        }

    @staticmethod
    def _building_number(building_name):
        """Return the integer formed by the trailing digits of
        `building_name`, e.g. 'house_12' -> 12.

        Raises
        ------
        ValueError
            If `building_name` does not end in at least one digit.
        """
        match = re.search(r'([0-9]+)\Z', building_name)
        if match is None:
            raise ValueError("Building name {!r} does not end in a number"
                             .format(building_name))
        return int(match.group(1))

    def load_building(self, root_directory, building_name,
                      periods_to_load=None,
                      one_sec_mains_params_to_load=None,
                      downsample_one_sec_mains_rule=None):
        """Load one building and store it in `self.buildings`, keyed by
        the building number.

        Parameters
        ----------
        root_directory : string
            Directory containing one sub-directory per building.

        building_name : string
            Name of the building's sub-directory, e.g. 'house_1'.  The
            trailing digits are used as the building number.

        periods_to_load : dict of tuples, optional
            Key of dict is the building number (int).
            Values are (<start date>, <end date>)
            e.g. ("2013-04-01", None) or ("2013-04-01", "2013-08-01")
            Defaults to the module-level DEFAULT_PERIODS_TO_LOAD.
            Buildings without an entry are not cropped.

        one_sec_mains_params_to_load : list of strings, optional
            some combination of {'active', 'apparent', 'voltage'}
            Defaults to ['active', 'voltage']

        downsample_one_sec_mains_rule : string, optional
            How to downsample the 1-second mains data, if available.
            e.g. '6S'
            if None then no downsampling will be done on the 1-sec mains data.

        Raises
        ------
        ValueError
            If `building_name` does not end in a number.
        """
        if one_sec_mains_params_to_load is None:
            one_sec_mains_params_to_load = ['active', 'voltage']

        # Construct new Building and set known attributes
        building = Building()
        building.metadata['original_name'] = building_name
        electric = building.utility.electric

        # Load labels.  The building number is parsed from ALL trailing
        # digits so that e.g. 'house_12' is building 12 rather than 2.
        building_number = self._building_number(building_name)
        building_dir = os.path.join(root_directory, building_name)
        labels = load_labels(building_dir)
        print("Loading building {:d}, orig name={}, path={}"
              .format(building_number, building_name, building_dir))

        # Process periods to load
        if periods_to_load is None:
            periods_to_load = DEFAULT_PERIODS_TO_LOAD
        start, end = periods_to_load.get(building_number, (None, None))
        if start or end:
            print("Will crop all channels for this building to start={}, end={}"
                  .format(start, end))

        # Remove dud channels.  DUD_CHANNELS doesn't specify dud channels
        # for all buildings, and a listed channel that is absent from the
        # labels is simply ignored.
        for dud_chan in DUD_CHANNELS.get(building_number, []):
            labels.pop(dud_chan, None)

        # Convert appliance names from UKPD to nilmtk standard names.
        # Iterate over a snapshot because `labels` is updated in the loop.
        # NOTE(review): appliance_metadata is collected but not yet attached
        # to the building.
        appliance_metadata = {}
        for chan, label in list(labels.items()):
            nilmtk_appliance = APPLIANCE_NAME_MAP.get(label)
            if nilmtk_appliance is not None:
                labels[chan] = nilmtk_appliance.name
                if nilmtk_appliance.metadata:
                    appliance_metadata[
                        nilmtk_appliance.name] = nilmtk_appliance.metadata

        def _pre_process_dataframe(df):
            # Convert to the dataset's timezone, then crop to the period
            # requested for this building.
            df = df.tz_convert(self.metadata['timezone'])
            return df[start:end]

        # Load 1-second mains, if available
        usecols = []
        # columns in mains.dat are: index, active, apparent, voltage
        # usecols counts the index column as col 0
        if 'active' in one_sec_mains_params_to_load:
            usecols.append(1)
        if 'apparent' in one_sec_mains_params_to_load:
            usecols.append(2)
        if 'voltage' in one_sec_mains_params_to_load:
            usecols.append(3)
        try:
            df = load_chan(building_dir, filename='mains.dat',
                           usecols=usecols,
                           colnames=[Measurement('power', 'active'),
                                     Measurement('power', 'apparent'),
                                     Measurement('voltage', '')])
        except IOError:
            # some houses don't have 1-second mains
            pass
        else:
            df = _pre_process_dataframe(df)
            if downsample_one_sec_mains_rule:
                # NOTE(review): `how=` is the old pandas resample API;
                # kept as-is to match the pandas version this module targets.
                df = df.resample(rule=downsample_one_sec_mains_rule,
                                 how='mean')
            if len(df) > MIN_SAMPLES_TO_LOAD:
                electric.mains[MainsName(split=1, meter=1)] = df

        # Split channels into mains and appliances.  Channels are visited
        # in sorted order so that appliance instance numbers do not depend
        # on dict iteration order.
        mains_chan = None
        appliance_chans = []
        for chan, label in sorted(labels.items()):
            if label == 'aggregate':
                mains_chan = chan
            else:
                appliance_chans.append(chan)

        # Load Current Cost mains chans (only if we haven't loaded 1sec mains)
        if (mains_chan is not None
                and electric.mains.get(MainsName(1, 1)) is None):
            mainsname = MainsName(split=1, meter=1)
            df = load_chan(building_dir, mains_chan,
                           colnames=[Measurement('power', 'apparent')])
            df = _pre_process_dataframe(df)
            electric.mains[mainsname] = df

        # Load sub metered channels
        instances = {}
        # instances is a dict which maps:
        # {<'appliance name'>: <index of next appliance instance>}
        measurement = Measurement('power', 'active')
        for appliance_chan in appliance_chans:
            # Get appliance label and instance
            label = labels[appliance_chan]
            instance = instances.get(label, 1)
            appliancename = ApplianceName(name=label, instance=instance)
            instances[label] = instance + 1

            df = load_chan(building_dir, appliance_chan,
                           colnames=[measurement])
            df = _pre_process_dataframe(df)
            # NOTE(review): whether this rename persists on the DataFrame
            # depends on pandas' column caching -- left unchanged.
            df[measurement].name = appliancename
            if len(df) > MIN_SAMPLES_TO_LOAD:
                electric.appliances[appliancename] = df

        self.buildings[building_number] = building

    def load_building_names(self, root_directory):
        """Return the sorted names of the building sub-directories of
        `root_directory`.

        Only names of the exact form 'house_<digits>' are returned, so
        that every name can be parsed by `load_building`.  The sort is
        lexicographic (so 'house_10' sorts before 'house_2').
        """
        # Anchored, with at least one digit: rejects 'house_' and names
        # with a suffix such as 'house_1_backup'.
        pattern = re.compile(r'house_[0-9]+\Z')
        names = [name
                 for name in get_immediate_subdirectories(root_directory)
                 if pattern.match(name)]
        names.sort()
        return names