from __future__ import print_function, division
from os import listdir, getcwd
from os.path import join, isdir, isfile, dirname, abspath
import pandas as pd
import datetime
import time
from nilmtk.datastore import Key
import warnings
from nilm_metadata import convert_yaml_to_hdf5
import csv


def convert_greend(greend_path, hdf_filename,
                   metadata_path='/path/to/metadata'):
    """Convert the GREEND dataset into a single HDF5 store.

    Every sub-directory of ``greend_path`` is treated as one house; houses
    are numbered from 1 in sorted name order.  Inside a house, every file
    whose name starts with ``dataset`` is read as CSV, the per-file frames
    are concatenated, and each column of the result is written to the
    store as one meter (numbered from 1 in column order) under
    ``Key(building=h, meter=m)``.

    Parameters
    ----------
    greend_path : str
        The root path of the greend dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    metadata_path : str, optional
        Directory handed to ``convert_yaml_to_hdf5`` once the store has
        been closed.  The default is the placeholder the original code
        hard-coded ("needs to be edited"); presumably it has to point at
        the real GREEND metadata for that step to succeed.
    """
    import io  # local import: only needed for the malformed-file fallback

    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')
    try:
        houses = sorted(__get_houses(greend_path))
        print(houses)
        for h, house in enumerate(houses, start=1):
            print('loading '+house)
            abs_house = join(greend_path, house)
            dates = [d for d in listdir(abs_house) if d.startswith('dataset')]
            day_frames = []
            for date in dates:
                print('-----------------------',date)
                # The first column is used as the index and is deliberately
                # not date-parsed: its values are converted one by one with
                # __timestamp below.
                try:
                    tmp_pandas = pd.read_csv(join(abs_house, date),
                                             index_col=0)
                except Exception:
                    # A parser error is raised for malformed files
                    # (irregular column number): retry on a cleaned copy
                    # that keeps only the rows with the dominant width.
                    cleaned = __preprocess_file(abs_house, date)
                    if isinstance(cleaned, bytes):
                        buf = io.BytesIO(cleaned)
                    else:
                        buf = io.StringIO(cleaned)
                    tmp_pandas = pd.read_csv(buf, index_col=0)
                # Drop rows whose index is the literal header label.
                tmp_pandas = tmp_pandas[tmp_pandas.index != 'timestamp']
                tmp_pandas = tmp_pandas.sort_index()
                tmp_pandas.index = [__timestamp(t) for t in tmp_pandas.index]
                day_frames.append(tmp_pandas)
            # One concat instead of repeated DataFrame.append (quadratic,
            # and removed from recent pandas).  concat rejects an empty
            # list, so a house without data files yields an empty frame.
            house_data = pd.concat(day_frames) if day_frames else pd.DataFrame()
            for m, meter in enumerate(house_data, start=1):
                print("meter" + str(m)+': ')
                key = Key(building=h, meter=m)
                print("Putting into store...")
                store.put(str(key), house_data[meter], format='table')
                print('Flushing store...')
                store.flush()
    finally:
        # Always release the HDF5 file handle, even when a house fails.
        store.close()

    # Runs only after a successful conversion, with the store closed.
    convert_yaml_to_hdf5(metadata_path, hdf_filename)
def __timestamp(t):
    """Convert a Unix-epoch value (number or numeric string) to a datetime.

    The value is truncated to whole seconds and converted with
    ``datetime.datetime.fromtimestamp``, i.e. the result is a naive
    datetime in the machine's local timezone.

    Returns the integer ``1`` (after printing the offending value) when the
    conversion raises ``ValueError``.
    NOTE(review): callers put the result into a DataFrame index, so this
    sentinel ends up mixed with datetimes -- confirm that is intended.
    """
    try:
        return datetime.datetime.fromtimestamp(int(float(t)))
    except ValueError:
        print('exception'+str(t))
        return 1


def __get_houses(greend_path):
    """Return the names of the entries of ``greend_path`` that are directories."""
    return [h for h in listdir(greend_path) if isdir(join(greend_path, h))]


def __preprocess_file(building_path, day_file):
    """Return the CSV text of ``day_file`` with irregular rows removed.

    Rows are grouped by their number of columns; only the rows of the most
    frequent column count are kept (in their original order) and re-encoded
    as CSV.  A summary of the column counts found is printed.

    Parameters
    ----------
    building_path : str
        Directory that contains ``day_file``.
    day_file : str
        Name of the CSV file to clean.

    Returns
    -------
    str
        The cleaned CSV content as a native string (text on Python 3,
        byte string on Python 2); an empty string if the file has no rows.
    """
    import io
    import sys
    from collections import defaultdict

    filename = join(building_path, day_file)
    # The csv module works on text streams under Python 3 and on byte
    # streams under Python 2, so pick the file mode and buffer to match.
    if sys.version_info[0] >= 3:
        csvfile = open(filename, 'r', newline='')
        out = io.StringIO()
    else:
        csvfile = open(filename, 'rb')
        out = io.BytesIO()

    cols_nums = defaultdict(list)
    with csvfile:  # make sure the input file is closed again
        for row in csv.reader(csvfile, delimiter=',', quotechar='|'):
            cols_nums[len(row)].append(row)  # group by column number

    if not cols_nums:
        # Empty file: there is no dominant column count to select.
        return out.getvalue()

    # (column count, number of rows) pairs, most frequent first
    best_col_num = sorted([(k, len(rows)) for k, rows in cols_nums.items()],
                          key=lambda x: x[1], reverse=True)
    # reject outliers (all rows with different column number)
    processed_rows = cols_nums[best_col_num[0][0]]
    print("\t"+day_file+" has", best_col_num, "taking only rows with", best_col_num[0][0], "columns")

    writer = csv.writer(out, delimiter=',', quotechar='|',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerows(processed_rows)
    return out.getvalue()


# Only runs when this file is executed as a script; for test purposes only.
# The paths below are machine-specific.
if __name__ == '__main__':
    t1 = time.time()
    convert_greend('/home/student/Downloads/GREEND_0-1_311014/',
                   '/home/student/Desktop/greend.h5')
    dt = time.time() - t1
    print('\n\nTime passed:\n'+str(int(dt/60))+' : ' + str(dt%60))