# Source code for nilmtk.stats.goodsectionsresults

from __future__ import print_function, division
import pandas as pd
from datetime import timedelta
import matplotlib.pyplot as plt
from ..results import Results
from nilmtk.timeframe import TimeFrame, convert_none_to_nat, convert_nat_to_none
from nilmtk.utils import get_tz, tz_localize_naive
from nilmtk.timeframegroup import TimeFrameGroup

class GoodSectionsResults(Results):
    """Results container for the "good sections" statistic.

    Each row of the underlying table describes one processed chunk of data
    and carries the good sections that were found inside that chunk.

    Attributes
    ----------
    max_sample_period_td : timedelta
        Built from the `max_sample_period` constructor argument (seconds).
        `combined()` uses it as the tolerance when deciding whether two
        chunks are adjacent.
    _data : pd.DataFrame
        index is start date for the whole chunk
        `end` is end date for the whole chunk
        `sections` is a TimeFrameGroups object (a list of nilmtk.TimeFrame objects)
    """

    name = "good_sections"

    def __init__(self, max_sample_period):
        """
        Parameters
        ----------
        max_sample_period : number
            Maximum sample period in seconds; stored as a `timedelta` in
            `max_sample_period_td`.
        """
        self.max_sample_period_td = timedelta(seconds=max_sample_period)
        super(GoodSectionsResults, self).__init__()

    def append(self, timeframe, new_results):
        """Append a single result.

        Parameters
        ----------
        timeframe : nilmtk.TimeFrame
        new_results : dict
            Must contain the key 'sections'.  Only the first element of
            ``new_results['sections']`` is used: it is passed to
            `TimeFrameGroup(...)` (see `import_from_cache`, which passes
            ``{'sections': [list_of_TimeFrames]}``).  The caller's dict is
            not modified.
        """
        # Work on a shallow copy so that replacing the 'sections' entry does
        # not leak back into the dict owned by the caller.
        new_results = dict(new_results)
        new_results['sections'] = [TimeFrameGroup(new_results['sections'][0])]
        super(GoodSectionsResults, self).append(timeframe, new_results)

    def combined(self):
        """Merges together any good sections which span multiple segments,
        as long as those segments are adjacent
        (previous.end - max_sample_period <= next.start <= previous.end).

        Returns
        -------
        sections : TimeFrameGroup (a subclass of Python's list class)
        """
        # NOTE(review): this method assigns to `.start` / `.end` /
        # `.include_end` of the TimeFrame objects held in `self._data` and
        # pops from the per-row section groups, so it appears to modify the
        # stored results as a side effect -- confirm whether callers (e.g.
        # `export_to_cache` after `to_dict`) rely on or suffer from this.
        sections = TimeFrameGroup()
        end_date_of_prev_row = None
        for index, row in self._data.iterrows():
            row_sections = row['sections']

            # Check if first TimeFrame of row_sections needs to be merged with
            # last TimeFrame of previous section
            if end_date_of_prev_row is not None:

                rows_are_adjacent = (
                    (end_date_of_prev_row - self.max_sample_period_td)
                    <= index <=
                    end_date_of_prev_row)

                # `row_sections` may be empty (a chunk without any good
                # section); guard before indexing, exactly as the `else`
                # branch below does.
                if (rows_are_adjacent and row_sections
                        and row_sections[0].start is None):
                    assert sections[-1].end is None
                    sections[-1].end = row_sections[0].end
                    row_sections.pop(0)
                else:
                    # row_sections[0] and sections[-1] were not in adjacent chunks
                    # so check if they are both open-ended and close them...
                    if sections and sections[-1].end is None:
                        try:
                            sections[-1].end = end_date_of_prev_row
                        except ValueError:  # end_date_of_prev_row before sections[-1].start
                            pass
                    if row_sections and row_sections[0].start is None:
                        try:
                            row_sections[0].start = index
                        except ValueError:
                            # presumably `index` is not a valid start for
                            # this section; left open-ended on purpose.
                            pass

            end_date_of_prev_row = row['end']
            sections.extend(row_sections)

        if sections:
            sections[-1].include_end = True
            # Close a trailing open-ended section at the end of the last chunk.
            if sections[-1].end is None:
                sections[-1].end = end_date_of_prev_row

        return sections

    def unify(self, other):
        """Unify with `other`, keeping only sections present in both.

        After delegating to the parent class' `unify`, every row's
        `sections` is replaced by its intersection with the `sections` that
        `other` holds for the same chunk start.

        Parameters
        ----------
        other : GoodSectionsResults
            Must have a row in `_data` for every chunk start in `self._data`.
        """
        super(GoodSectionsResults, self).unify(other)
        for start, row in self._data.iterrows():
            other_sections = other._data['sections'].loc[start]
            intersection = row['sections'].intersection(other_sections)
            # NOTE(review): chained-indexing assignment; it only updates
            # `self._data` if `self._data['sections']` is a view -- confirm
            # for the pandas version in use.
            self._data['sections'].loc[start] = intersection

    def to_dict(self):
        """
        Returns
        -------
        dict
            ``{'statistics': {'good_sections': [...]}}`` where the list holds
            `timeframe.to_dict()` for every TimeFrame from `combined()`.
        """
        good_sections = self.combined()
        good_sections_list_of_dicts = [timeframe.to_dict()
                                       for timeframe in good_sections]
        return {'statistics': {'good_sections': good_sections_list_of_dicts}}

    def plot(self, **kwargs):
        """Plot the combined good sections.

        All keyword arguments are forwarded to the `plot` method of the
        TimeFrameGroup returned by `combined()`; its return value is returned.
        """
        timeframes = self.combined()
        return timeframes.plot(**kwargs)

    def import_from_cache(self, cached_stat, sections):
        """Load cached results for the requested chunks.

        Parameters
        ----------
        cached_stat : pd.DataFrame
            One row per good section, indexed by chunk start (so the index
            contains duplicates), with columns 'end', 'section_start' and
            'section_end' -- the layout produced by `export_to_cache`.
        sections : container of nilmtk.TimeFrame
            Only cached chunks whose TimeFrame is `in sections` are appended.
        """
        # we (deliberately) use duplicate indices to cache GoodSectionResults
        grouped_by_index = cached_stat.groupby(level=0)
        tz = get_tz(cached_stat)
        for tf_start, df_grouped_by_index in grouped_by_index:
            grouped_by_end = df_grouped_by_index.groupby('end')
            for tf_end, sections_df in grouped_by_end:
                end = tz_localize_naive(tf_end, tz)
                timeframe = TimeFrame(tf_start, end)
                if timeframe in sections:
                    timeframes = []
                    for _, row in sections_df.iterrows():
                        section_start = tz_localize_naive(row['section_start'], tz)
                        section_end = tz_localize_naive(row['section_end'], tz)
                        timeframes.append(TimeFrame(section_start, section_end))
                    # `append` expects the list of TimeFrames wrapped in a
                    # one-element list.
                    self.append(timeframe, {'sections': [timeframes]})

    def export_to_cache(self):
        """
        Returns
        -------
        DataFrame with three columns: 'end', 'section_end', 'section_start'.
            Instead of storing a list of TimeFrames on each row,
            we store one TimeFrame per row.  This is because pd.HDFStore cannot
            save a DataFrame where one column is a list if using 'table' format'.
            We also need to strip the timezone information from the data columns.
            When we import from cache, we assume the timezone for the data
            columns is the same as the tz for the index.
        """
        index_for_cache = []
        data_for_cache = []  # list of dicts with keys 'end', 'section_end', 'section_start'
        for index, row in self._data.iterrows():
            for section in row['sections']:
                index_for_cache.append(index)
                data_for_cache.append(
                    {'end': row['end'],
                     'section_start': convert_none_to_nat(section.start),
                     'section_end': convert_none_to_nat(section.end)})
        df = pd.DataFrame(data_for_cache, index=index_for_cache)
        # `DataFrame.convert_objects` was removed in pandas 1.0.  Keep using
        # it where it exists (unchanged behaviour) and fall back to
        # `infer_objects`, its soft-conversion replacement, otherwise.
        # NOTE(review): with `infer_objects`, tz-aware values may keep their
        # timezone -- confirm the cache round-trip on newer pandas.
        if hasattr(df, 'convert_objects'):
            return df.convert_objects()
        return df.infer_objects()