# Source code for nilmtk.dataset_converters.greend.convert_greend

from __future__ import print_function, division
from os import listdir, getcwd
from os.path import join, isdir, isfile, dirname, abspath
import pandas as pd
import datetime
import time
from nilmtk.datastore import Key
import warnings
from nilm_metadata import convert_yaml_to_hdf5
import csv

# Import-time side effect: install an "ignore" filter with no category or
# module restriction, i.e. suppress every warning from here on.
warnings.filterwarnings("ignore")

def convert_greend(greend_path, hdf_filename, metadata_path='/path/to/metadata'):
    """Convert the GREEND CSV files into a single HDF5 store.

    Every sub-directory of ``greend_path`` is treated as one building
    (numbered from 1 in sorted name order).  All files in a building whose
    name starts with ``dataset`` are read, concatenated, and each column of
    the result is written as one meter (numbered from 1 in column order).

    Parameters
    ----------
    greend_path : str
        The root path of the greend dataset.
    hdf_filename : str
        The destination HDF5 filename (including path and suffix).
    metadata_path : str, optional
        Directory passed to ``convert_yaml_to_hdf5`` once the measurements
        are written.  The default is the placeholder that used to be
        hard-coded here; pass the real metadata directory instead.
    """
    import io

    store = pd.HDFStore(hdf_filename, 'w', complevel=9, complib='zlib')
    try:
        houses = sorted(__get_houses(greend_path))
        print(houses)
        for h, house in enumerate(houses, start=1):
            print('loading '+house)
            abs_house = join(greend_path, house)
            # listdir() order is arbitrary; sort so the run is deterministic.
            dates = sorted(d for d in listdir(abs_house)
                           if d.startswith('dataset'))
            day_frames = []
            for date in dates:
                print('-----------------------',date)
                # read_csv(index_col=0, parse_dates=True) is the documented
                # replacement for the removed DataFrame.from_csv().
                try:
                    tmp_pandas = pd.read_csv(join(abs_house, date),
                                             index_col=0, parse_dates=True)
                except Exception:
                    # A parser error is raised for malformed files (irregular
                    # column number): retry on a cleaned in-memory copy.
                    cleaned = io.BytesIO(__preprocess_file(abs_house, date))
                    tmp_pandas = pd.read_csv(cleaned,
                                             index_col=0, parse_dates=True)

                # Drop header lines that are repeated inside the file.
                tmp_pandas = tmp_pandas[tmp_pandas.index != 'timestamp']
                tmp_pandas = tmp_pandas.sort_index()
                tmp_pandas.index = [__timestamp(t) for t in tmp_pandas.index]
                day_frames.append(tmp_pandas)

            # One concat instead of repeated DataFrame.append (removed in
            # pandas 2.0, and quadratic in the number of day files).
            house_data = pd.concat(day_frames) if day_frames else pd.DataFrame()

            for m, meter in enumerate(house_data, start=1):
                print("meter" + str(m)+': ')
                key = Key(building = h, meter=m)
                print("Putting into store...")
                store.put(str(key), house_data[meter], format = 'table')
                print('Flushing store...')
                store.flush()
    finally:
        # Release the HDF5 file handle even when a house fails to convert.
        store.close()

    convert_yaml_to_hdf5(metadata_path, hdf_filename)


def __timestamp(t):
    """Convert a UNIX timestamp (number or numeric string) to a datetime.

    The fractional part is truncated and the value is interpreted by
    ``datetime.datetime.fromtimestamp`` (local time zone, naive result).

    Returns
    -------
    datetime.datetime or int
        The converted value, or the sentinel ``1`` when ``t`` cannot be
        converted; in that case a message is printed instead of raising.
    """
    try:
        return datetime.datetime.fromtimestamp(int(float(t)))
    except (ValueError, TypeError, OverflowError, OSError):
        # ValueError: non-numeric text or NaN; TypeError: None and other
        # non-numbers; OverflowError/OSError: infinite or out-of-range values.
        # All of them are bad cells that must not abort a whole conversion.
        print('exception'+str(t))
        return 1

def __get_houses(greend_path):
    """Return the names of all directories directly inside ``greend_path``.

    Plain files are skipped; names are returned in ``listdir`` order.
    """
    houses = []
    for entry in listdir(greend_path):
        if isdir(join(greend_path, entry)):
            houses.append(entry)
    return houses
    
def __preprocess_file(building_path, day_file):
    """Return the CSV content of a day file reduced to its dominant row width.

    Rows are grouped by their number of fields.  Only the rows with the most
    frequent field count are kept (in their original order); all other rows
    are rejected as malformed.

    Parameters
    ----------
    building_path : str
        Directory containing the day file.
    day_file : str
        File name of the day file inside ``building_path``.

    Returns
    -------
    bytes (``str`` on Python 2)
        The kept rows re-serialised as CSV; empty when the file has no rows.
    """
    import io
    import sys
    from collections import defaultdict

    filename = join(building_path, day_file)
    py2 = sys.version_info[0] < 3

    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.  latin-1 maps every byte to exactly one
    # code point, so decoding here and encoding below round-trips losslessly.
    if py2:
        csvfile = open(filename, 'rb')
    else:
        csvfile = io.open(filename, 'r', newline='', encoding='latin-1')

    cols_nums = defaultdict(list)
    with csvfile:  # close the input even if parsing fails
        for row in csv.reader(csvfile, delimiter = ',', quotechar='|'):
            cols_nums[len(row)].append(row) # group by column number

    if not cols_nums:
        # Empty file: there is no dominant width to select.
        return b''

    # (column count, number of rows) pairs, most frequent column count first
    best_col_num = sorted([(k, len(rows)) for k, rows in cols_nums.items()],
                          key=lambda x:x[1], reverse=True)
    # reject outliers (all rows with different column number)
    processed_rows = cols_nums[best_col_num[0][0]]
    print("\t"+day_file+" has", best_col_num, "taking only rows with", best_col_num[0][0], "columns")

    if py2:
        out = io.BytesIO()
        writer = csv.writer(out, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerows(processed_rows)
        return out.getvalue()

    out = io.StringIO(newline='')
    writer = csv.writer(out, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    writer.writerows(processed_rows)
    return out.getvalue().encode('latin-1')

# Runs only when this file is executed directly; intended for manual testing.
# Usage: python <this file> [GREEND_ROOT [OUTPUT_H5]]
if __name__ == '__main__':
    import sys

    # Fall back to the original developer paths when no arguments are given.
    cli_args = sys.argv[1:]
    greend_root = (cli_args[0] if len(cli_args) > 0
                   else '/home/student/Downloads/GREEND_0-1_311014/')
    output_h5 = (cli_args[1] if len(cli_args) > 1
                 else '/home/student/Desktop/greend.h5')

    t1 = time.time()
    convert_greend(greend_root, output_h5)
    dt = time.time()- t1
    # Elapsed time as "<whole minutes> : <remaining seconds>".
    print('\n\nTime passed:\n'+str(int(dt/60))+' : ' + str(dt%60))