# Source code for nilmtk.dataset_converters.hes.convert_hes

from __future__ import print_function, division
from datetime import datetime
from os import remove
from os.path import join
from sys import stderr

import numpy as np
import pandas as pd
from pytz import UTC

from nilm_metadata import convert_yaml_to_hdf5
from nilmtk import DataSet
from nilmtk.dataset_converters.redd.convert_redd import (_convert, _load_csv)
from nilmtk.datastore import Key
from nilmtk.utils import get_module_directory
from nilmtk.utils import get_datastore

"""
TODO
----
* convert HES appliance names to NILMTK standard
* what exactly is measured? Real power? Apparent?
* houses which have multiple mains: are they multiple 'splits' or phases or meters?
* dataset metadata
* some houses have both 2- and 10-minute data.  Might need a function to ignore 10 minute data.
* set up wiring to take into consideration the information in 
  'total_profiles.csv'  Sockets 1-11 are circuits monitored
  at the consumer unit which feed wall sockets around the dwelling.
* import the enormous amount of appliance metadata in 'appliance_data.csv', 
  especially channels which recorded multiple appliances
* use the metadata in 'ipsos.csv' and 'rdsap_data.csv' and 'rdsap_*.csv' for each Building
* Maybe email CAR to let them know that nilmtk can now import HES.
HES notes
---------
* 14 homes recorded mains but only 5 were kept after cleaning
* circuit-level data from the consumer unit was recorded as 'sockets' for 216 houses
* 'total_profiles.csv' records pairs of <house>,<appliance> which are the 
  channels which need to be added to produce the whole-home total, which
  I think consists of all the circuit-level meters plus all appliances
  which are not also monitored at circuit level.
* appliance 2000 represents the calculated aggregate ???
* appliance 159 represents the difference between ???
  this and the sum of the known appliances
* appliance_codes.csv maps from <appliance code> to <appliance name>
* seasonal_adjustments.csv stores the trends in energy usage per appliance 
  activation over a year.
"""

# The six CSV files which together hold the HES appliance-level readings.
FILENAMES = ['appliance_group_data-{}.csv'.format(s) for s in
             ['1a', '1b', '1c', '1d', '2', '3']]

# Number of CSV rows to read per chunk.  Must be an int: pandas'
# read_csv(chunksize=...) rejects floats (the original `1E5` was a float).
CHUNKSIZE = 100000

COL_NAMES = ['interval id', 'house id', 'appliance code', 'date',
             'data', 'time']
LAST_PWR_COLUMN = 250
NANOSECONDS_PER_TENTH_OF_AN_HOUR = 1E9 * 60 * 6

# Appliance codes with special meaning (see module docstring / HES docs).
MAINS_CODES = [240, 241]
TEMPERATURE_CODES = list(range(251, 256))
# `range(...) + [...]` only worked on Python 2, where range() returned a
# list; wrap in list() so it also works on Python 3.
CIRCUIT_CODES = list(range(208, 218)) + [222]
#E_MEASUREMENT = Measurement('energy', 'active')


def datetime_converter(s):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` timestamp string into a UTC datetime.

    Parameters
    ----------
    s : str
        Timestamp of the form ``2011-02-02 14:48:00``.

    Returns
    -------
    datetime
        Timezone-aware datetime (UTC).
    """
    # Character positions:  0123456789012345678
    #                       2011-02-02 14:48:00
    # Manual slicing is ~8x faster than
    # datetime.strptime(s, '%Y-%m-%d %H:%M:%S').
    date_parts = int(s[0:4]), int(s[5:7]), int(s[8:10])
    time_parts = int(s[11:13]), int(s[14:16]), int(s[17:19])
    return datetime(*(date_parts + time_parts), tzinfo=UTC)
def load_list_of_house_ids(data_dir):
    """Returns a list of house IDs in HES (ints).

    Parameters
    ----------
    data_dir : str
        Directory containing 'ipsos-anonymised-corrected 310713.csv'.

    Returns
    -------
    list of int
        One entry per house, taken from the first column of the CSV.
    """
    filename = join(data_dir, 'ipsos-anonymised-corrected 310713.csv')
    # read_csv(squeeze=True) was deprecated in pandas 1.4 and removed in
    # 2.0; DataFrame.squeeze('columns') is the supported equivalent.
    series = pd.read_csv(filename, usecols=[0],
                         index_col=False).squeeze('columns')
    return series.tolist()


"""Load data from UK Government's Household Electricity Survey (the cleaned
version of the dataset released in summer 2013).
"""

# TODO: re-use code from
# https://github.com/JackKelly/pda/blob/master/scripts/hes/load_hes.py

"""
Broad approach:
* load list of houses
* create dataset.buildings dict with empty Buildings
* the keys of `dataset.buildings` are the HES house IDs, which will be
  converted to the nilmtk standard after loading.
* load CHUNK_SIZE of data from CSV into a DataFrame
* convert datetime
* get list of houses in the DF
* for each house:
    * Load previously converted data from the HDFStore
    * append new data
    * save back to HDFStore
* When all houses are complete, post-process:
    * sort all indicies
    * set timezone
    * convert energy to nilmtk standard energy unit (kWh?)
    * convert Wh to watts (retain energy)
      (see 'convert_hes_to_watts.py' from pda)
    * convert keys of `dataset.buildings`
"""
def convert_hes(data_dir, output_filename, format='HDF', max_chunks=None):
    """Convert the HES dataset to a NILMTK DataStore.

    Parameters
    ----------
    data_dir : str
        Directory containing the HES CSV files.
    output_filename : str
        Destination path for the datastore.
    format : str
        Datastore format, passed straight to `get_datastore` (e.g. 'HDF').
    max_chunks : int or None
        If given, stop after this many chunks per file (useful for testing).
    """
    # TODO: `metadata` is built but not yet written to the store.
    metadata = {
        'name': 'HES',
        'geographic_coordinates': (51.464462, -0.076544),  # London
        'timezone': 'Europe/London'
    }

    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')

    # Load the appliance table and dump the distinct appliance names to a
    # lookup CSV, so they can later be mapped to NILMTK standard names.
    # (Removed a leftover `import ipdb; ipdb.set_trace()` debugger
    # breakpoint which halted every conversion run.)
    full_filename = join(data_dir, 'appliance_data.csv')
    hes_appliance_data = pd.read_csv(full_filename)
    # DataFrame.sort() was removed in pandas 0.20; sort_values() is the
    # supported replacement.
    (hes_appliance_data[['ApplianceText']]
     .drop_duplicates()
     .sort_values(by='ApplianceText')
     .to_csv(join(data_dir, 'hes_to_nilmtk_appliance_lookup.csv'),
             index=False))

    # load list of houses
    house_ids = load_list_of_house_ids(data_dir)
    for house_id in house_ids:
        # TODO: create a Building per house:
        # building = Building()
        # building.metadata['original_name'] = house_id
        # buildings[house_id] = building
        pass

    houses_loaded = set()
    for filename in FILENAMES:
        # Load appliance energy data chunk-by-chunk
        full_filename = join(data_dir, filename)
        print('loading', full_filename)
        try:
            # chunksize must be an int; CHUNKSIZE may be a float literal.
            reader = pd.read_csv(full_filename, names=COL_NAMES,
                                 index_col=False, chunksize=int(CHUNKSIZE))
        except IOError as e:
            print(e, file=stderr)
            continue

        # Process each chunk
        chunk_i = 0
        for chunk in reader:
            if max_chunks is not None and chunk_i >= max_chunks:
                break
            print('processing chunk', chunk_i, 'of', filename)

            # Convert date and time columns to np.datetime64 objects
            dt = chunk['date'] + ' ' + chunk['time']
            del chunk['date']
            del chunk['time']
            chunk['datetime'] = dt.apply(datetime_converter)

            # Data is either tenths of a Wh or tenths of a degree
            chunk['data'] *= 10
            chunk['data'] = chunk['data'].astype(np.float32)

            # Process each house in chunk
            houses_in_chunk = chunk['house id'].unique()  # TODO: use groupby?!?
            houses_loaded = houses_loaded.union(set(houses_in_chunk))
            for house_id in houses_in_chunk:
                print(house_id)
                _process_house_in_chunk(house_id, chunk, store)

            chunk_i += 1

    print('houses with some data loaded:', houses_loaded)
    store.close()
def _process_house_in_chunk(house_id, chunk, store):
    """Append one house's rows from `chunk` to `store`, per appliance code.

    Rows whose appliance code is a temperature channel are skipped.

    NOTE(review): every non-temperature channel is currently written under
    the same key (building=house_id, meter=1), so successive appliances are
    appended to one meter — TODO: map each appliance/mains/circuit code to
    its own meter number.
    """
    house_data = chunk[chunk['house id'] == house_id]
    for appliance_code, appliance_data in house_data.groupby('appliance code'):
        print('\t' + str(appliance_code))
        data = appliance_data['data'].values
        index = appliance_data['datetime']
        df = pd.DataFrame(data=data, index=index,
                          columns=[('power', 'active')])

        is_temperature = appliance_code in TEMPERATURE_CODES
        if appliance_code in MAINS_CODES:
            # The nilmtk-v0.1 code here referenced undefined names
            # (`electric`, `MainsName`) and raised NameError for every
            # mains channel.  TODO: give mains their own keys, e.g.:
            # split = MAINS_CODES.index(appliance_code) + 1
            pass
        elif appliance_code in CIRCUIT_CODES:
            # Likewise for circuit ('sockets') channels, e.g.:
            # split = CIRCUIT_CODES.index(appliance_code) + 1
            pass
        elif is_temperature:
            pass  # TODO: store temperature data
        else:
            pass  # TODO: use nilmtk ApplianceNames

        if not is_temperature:
            key = Key(building=house_id, meter=1)
            store.append(str(key), df)