from __future__ import print_function, division
from os import remove
from os.path import join
import pandas as pd
from datetime import datetime
from pytz import UTC
import numpy as np
from nilmtk.dataset_converters.redd.convert_redd import (_convert, _load_csv)
from nilmtk.utils import get_module_directory
from nilmtk import DataSet
from nilmtk.utils import get_datastore
from nilmtk.datastore import Key
from nilm_metadata import convert_yaml_to_hdf5
"""
TODO
----
* convert HES appliance names to NILMTK standard
* what exactly is measured? Real power? Apparent?
* houses which have multiple mains: are they multiple 'splits' or phases or meters?
* dataset metadata
* some houses have both 2- and 10-minute data. Might need a function to ignore 10 minute data.
* set up wiring to take into consideration the information in
'total_profiles.csv' Sockets 1-11 are circuits monitored
at the consumer unit which feed fall sockets around the dwelling.
* import the enormous amount of appliance metadata in 'appliance_data.csv',
especially channels which recorded multiple appliances
* use the metadata in 'ipsos.csv' and 'rdsap_data.csv' and 'rdsap_*.csv' for each Building
* Maybe email CAR to let them know that nilmtk can now import HES.
HES notes
---------
* 14 homes recorded mains but only 5 were kept after cleaning
* circuit-level data from the consumer unit was recorded as 'sockets' for 216 houses
* 'total_profiles.csv' records pairs of <house>,<appliance> which are the
channels which need to be added to produce the whole-home total, which
I think consists of all the circuit-level meters plus all appliances
which are not also monitored at circuit level.
* appliance 2000 represents the calculated aggregate ???
* appliance 159 represents the difference between ???
this and the sum of the known appliances
* appliance_codes.csv maps from <appliance code> to <appliance name>
* seasonal_adjustments.csv stores the trends in energy usage per appliance
activation over a year.
"""
# Names of the HES appliance-data CSV files, in the order they are converted.
FILENAMES = ['appliance_group_data-{}.csv'.format(s) for s in
             ['1a', '1b', '1c', '1d', '2', '3']]

# Number of CSV rows per chunk.  Must be an int: pandas rejects a float
# `chunksize`.
CHUNKSIZE = 100000

# Column names given to the (header-less) appliance-data CSV files.
COL_NAMES = ['interval id', 'house id', 'appliance code', 'date',
             'data', 'time']

LAST_PWR_COLUMN = 250
NANOSECONDS_PER_TENTH_OF_AN_HOUR = 1E9 * 60 * 6

# HES appliance codes grouped by the kind of channel they identify.
MAINS_CODES = [240, 241]
TEMPERATURE_CODES = range(251, 256)
# list(...) is required on Python 3, where a range cannot be concatenated
# with a list.
CIRCUIT_CODES = list(range(208, 218)) + [222]

#E_MEASUREMENT = Measurement('energy', 'active')
def datetime_converter(s):
    """Parse a fixed-width timestamp string into a UTC datetime.

    Parameters
    ----------
    s : str
        of the form 2011-02-02 14:48:00

    Returns
    -------
    datetime
        timezone-aware, with `tzinfo` set to UTC.
    """
    # Character positions within the timestamp:
    #   0123456789012345678
    #   2011-02-02 14:48:00
    # Slicing fixed positions was measured by the original author to be
    # ~8 times faster than datetime.strptime(s, '%Y-%m-%d %H:%M:%S').
    return datetime(year=int(s[0:4]), month=int(s[5:7]), day=int(s[8:10]),
                    hour=int(s[11:13]), minute=int(s[14:16]),
                    second=int(s[17:19]), tzinfo=UTC)
def load_list_of_house_ids(data_dir):
    """Returns a list of house IDs in HES (ints).

    Parameters
    ----------
    data_dir : str
        Directory containing 'ipsos-anonymised-corrected 310713.csv'.

    Returns
    -------
    list
        The values of the first column of that CSV file, one per data row.
    """
    filename = join(data_dir, 'ipsos-anonymised-corrected 310713.csv')
    first_column = pd.read_csv(filename, usecols=[0], index_col=False)
    # Select the single column positionally instead of `squeeze=True`:
    # that keyword was removed from `read_csv` in pandas 2.0, and it
    # collapsed a one-row file to a scalar rather than a one-item list.
    return first_column.iloc[:, 0].tolist()
"""Load data from UK Government's Household Electricity Survey
(the cleaned version of the dataset released in summer 2013).
"""
# TODO: re-use code from
# https://github.com/JackKelly/pda/blob/master/scripts/hes/load_hes.py
"""
Broad approach:
* load list of houses
* create dataset.buildings dict with empty Buildings
* the keys of `dataset.buildings` are the HES house IDs, which
will be converted to the nilmtk standard after loading.
* load CHUNKSIZE rows of data from CSV into a DataFrame
* convert datetime
* get list of houses in the DF
* for each house:
* Load previously converted data from the HDFStore
* append new data
* save back to HDFStore
* When all houses are complete, post-process:
* sort all indices
* set timezone
* convert energy to nilmtk standard energy unit (kWh?)
* convert Wh to watts (retain energy)
(see 'convert_hes_to_watts.py' from pda)
* convert keys of `dataset.buildings`
"""
def convert_hes(data_dir, output_filename, format='HDF', max_chunks=None):
    """Convert the HES appliance CSV files into a datastore.

    Besides filling the datastore, this writes the sorted, de-duplicated
    'ApplianceText' column of 'appliance_data.csv' to
    'hes_to_nilmtk_appliance_lookup.csv' inside `data_dir`.

    Parameters
    ----------
    data_dir : str
        Directory holding the HES CSV files.
    output_filename : str
        Passed to `get_datastore` as the store to open for writing.
    format : str, optional
        Datastore format passed to `get_datastore`.  Defaults to 'HDF'.
    max_chunks : int or None, optional
        If not None, at most this many chunks (of `CHUNKSIZE` rows) are
        processed from each CSV file.
    """
    import sys

    # TODO: this dataset metadata is not yet written to the datastore.
    metadata = {
        'name': 'HES',
        'geographic_coordinates': (51.464462, -0.076544),  # London
        'timezone': 'Europe/London'
    }

    # Open DataStore
    store = get_datastore(output_filename, format, mode='w')
    try:
        # Load list of appliances and dump the distinct appliance names.
        full_filename = join(data_dir, 'appliance_data.csv')
        hes_appliance_data = pd.read_csv(full_filename)
        # `sort_values` replaces `DataFrame.sort(columns=...)`, which has
        # been removed from pandas.
        appliance_names = (hes_appliance_data[['ApplianceText']]
                           .drop_duplicates()
                           .sort_values(by='ApplianceText'))
        appliance_names.to_csv(
            join(data_dir, 'hes_to_nilmtk_appliance_lookup.csv'),
            index=False)

        # Load list of houses.
        # TODO: create a building (with 'original_name' metadata) per
        # house; for now the list is loaded but not used.
        house_ids = load_list_of_house_ids(data_dir)

        houses_loaded = set()
        for filename in FILENAMES:
            # Load appliance energy data chunk-by-chunk
            full_filename = join(data_dir, filename)
            print('loading', full_filename)
            try:
                reader = pd.read_csv(full_filename, names=COL_NAMES,
                                     index_col=False,
                                     chunksize=int(CHUNKSIZE))
            except IOError as e:
                # Best effort: report the unreadable file and carry on
                # with the remaining ones.
                print(e, file=sys.stderr)
                continue

            # Process each chunk
            for chunk_i, chunk in enumerate(reader):
                if max_chunks is not None and chunk_i >= max_chunks:
                    break

                print('processing chunk', chunk_i, 'of', filename)

                # Merge the date and time columns into a single column of
                # timezone-aware datetimes.
                dt = chunk['date'] + ' ' + chunk['time']
                del chunk['date']
                del chunk['time']
                chunk['datetime'] = dt.apply(datetime_converter)

                # Data is either tenths of a Wh or tenths of a degree
                # NOTE(review): multiplying (rather than dividing) by 10
                # looks odd for a "tenths of" unit -- confirm the scaling.
                chunk['data'] *= 10
                chunk['data'] = chunk['data'].astype(np.float32)

                # Process each house in chunk
                houses_in_chunk = chunk['house id'].unique()  # TODO: use groupby?!?
                houses_loaded.update(houses_in_chunk)
                for house_id in houses_in_chunk:
                    print(house_id)
                    _process_house_in_chunk(house_id, chunk, store)

        print('houses with some data loaded:', houses_loaded)
    finally:
        # Close the store even if the conversion fails part-way through.
        store.close()
def _process_house_in_chunk(house_id, chunk, store):
    """Append the rows of `chunk` that belong to one house to `store`.

    The house's rows are grouped by appliance code.  Groups whose code is
    in `TEMPERATURE_CODES` are skipped; every other group is appended to
    the store as a single-column DataFrame indexed by datetime.

    Parameters
    ----------
    house_id : value found in the 'house id' column of `chunk`
    chunk : pd.DataFrame
        Must provide the columns 'house id', 'appliance code', 'data'
        and 'datetime'.
    store : object providing `append(key, df)`
    """
    house_data = chunk[chunk['house id'] == house_id]
    for appliance_code, appliance_data in house_data.groupby('appliance code'):
        print('\t' + str(appliance_code))

        if appliance_code in TEMPERATURE_CODES:
            # TODO: temperature channels are not stored yet.
            continue

        df = pd.DataFrame(data=appliance_data['data'].values,
                          index=appliance_data['datetime'],
                          columns=[('power', 'active')])

        # The earlier per-type bookkeeping for mains and circuit codes used
        # names that no longer exist (`electric`, `MainsName`,
        # `CircuitName`), so it raised NameError for those codes; its
        # results were discarded anyway because the key is built here.
        # TODO: give mains (MAINS_CODES), circuits (CIRCUIT_CODES) and
        # individual appliances their own meter numbers -- at present
        # every channel of a house is appended under meter 1.
        key = Key(building=house_id, meter=1)
        store.append(str(key), df)