Source code for thermostat.stats

import pandas as pd
import numpy as np
from scipy.stats import norm

from eemeter.location import _load_zipcode_to_lat_lng_index
from eemeter.location import _load_zipcode_to_station_index

from collections import OrderedDict
from collections import defaultdict
from itertools import chain
from warnings import warn
import json
from functools import reduce
from pkg_resources import resource_stream

from thermostat import get_version

REAL_OR_INTEGER_VALUED_COLUMNS_HEATING = [
    'n_days_in_inputfile_date_range',
    'n_days_both_heating_and_cooling',
    'n_days_insufficient_data',
    'n_core_heating_days',

    'baseline_percentile_core_heating_comfort_temperature',
    'regional_average_baseline_heating_comfort_temperature',

    'percent_savings_baseline_percentile',
    'avoided_daily_mean_core_day_runtime_baseline_percentile',
    'avoided_total_core_day_runtime_baseline_percentile',
    'baseline_daily_mean_core_day_runtime_baseline_percentile',
    'baseline_total_core_day_runtime_baseline_percentile',
    '_daily_mean_core_day_demand_baseline_baseline_percentile',
    'percent_savings_baseline_regional',
    'avoided_daily_mean_core_day_runtime_baseline_regional',
    'avoided_total_core_day_runtime_baseline_regional',
    'baseline_daily_mean_core_day_runtime_baseline_regional',
    'baseline_total_core_day_runtime_baseline_regional',
    '_daily_mean_core_day_demand_baseline_baseline_regional',
    'mean_demand',
    'alpha',
    'tau',
    'mean_sq_err',
    'root_mean_sq_err',
    'cv_root_mean_sq_err',
    'mean_abs_err',
    'mean_abs_pct_err',

    'total_core_heating_runtime',
    'total_auxiliary_heating_core_day_runtime',
    'total_emergency_heating_core_day_runtime',

    'daily_mean_core_heating_runtime',

    'rhu_00F_to_05F',
    'rhu_05F_to_10F',
    'rhu_10F_to_15F',
    'rhu_15F_to_20F',
    'rhu_20F_to_25F',
    'rhu_25F_to_30F',
    'rhu_30F_to_35F',
    'rhu_35F_to_40F',
    'rhu_40F_to_45F',
    'rhu_45F_to_50F',
    'rhu_50F_to_55F',
    'rhu_55F_to_60F',
]

REAL_OR_INTEGER_VALUED_COLUMNS_COOLING = [
    'n_days_in_inputfile_date_range',
    'n_days_both_heating_and_cooling',
    'n_days_insufficient_data',
    'n_core_cooling_days',

    'baseline_percentile_core_cooling_comfort_temperature',
    'regional_average_baseline_cooling_comfort_temperature',

    'percent_savings_baseline_percentile',
    'avoided_daily_mean_core_day_runtime_baseline_percentile',
    'avoided_total_core_day_runtime_baseline_percentile',
    'baseline_daily_mean_core_day_runtime_baseline_percentile',
    'baseline_total_core_day_runtime_baseline_percentile',
    '_daily_mean_core_day_demand_baseline_baseline_percentile',
    'percent_savings_baseline_regional',
    'avoided_daily_mean_core_day_runtime_baseline_regional',
    'avoided_total_core_day_runtime_baseline_regional',
    'baseline_daily_mean_core_day_runtime_baseline_regional',
    'baseline_total_core_day_runtime_baseline_regional',
    '_daily_mean_core_day_demand_baseline_baseline_regional',
    'mean_demand',
    'alpha',
    'tau',
    'mean_sq_err',
    'root_mean_sq_err',
    'cv_root_mean_sq_err',
    'mean_abs_err',
    'mean_abs_pct_err',

    'total_core_cooling_runtime',

    'daily_mean_core_cooling_runtime',
]

REAL_OR_INTEGER_VALUED_COLUMNS_ALL = [
    'n_days_in_inputfile_date_range',
    'n_days_both_heating_and_cooling',
    'n_days_insufficient_data',
    'n_core_cooling_days',
    'n_core_heating_days',

    'baseline_percentile_core_cooling_comfort_temperature',
    'baseline_percentile_core_heating_comfort_temperature',
    'regional_average_baseline_cooling_comfort_temperature',
    'regional_average_baseline_heating_comfort_temperature',

    'percent_savings_baseline_percentile',
    'avoided_daily_mean_core_day_runtime_baseline_percentile',
    'avoided_total_core_day_runtime_baseline_percentile',
    'baseline_daily_mean_core_day_runtime_baseline_percentile',
    'baseline_total_core_day_runtime_baseline_percentile',
    '_daily_mean_core_day_demand_baseline_baseline_percentile',
    'percent_savings_baseline_regional',
    'avoided_daily_mean_core_day_runtime_baseline_regional',
    'avoided_total_core_day_runtime_baseline_regional',
    'baseline_daily_mean_core_day_runtime_baseline_regional',
    'baseline_total_core_day_runtime_baseline_regional',
    '_daily_mean_core_day_demand_baseline_baseline_regional',
    'mean_demand',
    'alpha',
    'tau',
    'mean_sq_err',
    'root_mean_sq_err',
    'cv_root_mean_sq_err',
    'mean_abs_err',
    'mean_abs_pct_err',

    'total_core_cooling_runtime',
    'total_core_heating_runtime',
    'total_auxiliary_heating_core_day_runtime',
    'total_emergency_heating_core_day_runtime',

    'daily_mean_core_cooling_runtime',
    'daily_mean_core_heating_runtime',

    'rhu_00F_to_05F',
    'rhu_05F_to_10F',
    'rhu_10F_to_15F',
    'rhu_15F_to_20F',
    'rhu_20F_to_25F',
    'rhu_25F_to_30F',
    'rhu_30F_to_35F',
    'rhu_35F_to_40F',
    'rhu_40F_to_45F',
    'rhu_45F_to_50F',
    'rhu_50F_to_55F',
    'rhu_55F_to_60F',
]

[docs]def combine_output_dataframes(dfs): """ Combines output dataframes. Useful when combining output from batches. Parameters ---------- dfs : list of pd.DataFrame Output dataFrames to combine into one. Returns ------- out : pd.DataFrame Dataframe with combined output metadata. """ return pd.concat(dfs, ignore_index=True)
[docs]def get_filtered_stats( df, row_filter, label, heating_or_cooling, target_columns, target_baseline_method): n_rows_total = df.shape[0] filtered_df = df[[row_filter(row, df) for i, row in df.iterrows()]] n_rows_kept = filtered_df.shape[0] n_rows_discarded = n_rows_total - n_rows_kept stats = OrderedDict() stats["label"] = "{}_{}".format(label, heating_or_cooling) stats["sw_version"] = get_version() stats["n_thermostat_core_day_sets_total"] = n_rows_total stats["n_thermostat_core_day_sets_kept"] = n_rows_kept stats["n_thermostat_core_day_sets_discarded"] = n_rows_discarded if n_rows_total > 0: for column_name in target_columns: column = filtered_df[column_name].replace([np.inf, -np.inf], np.nan).dropna() # calculate quantiles and statistics mean = np.nanmean(column) sem = np.nanstd(column) / (column.count() ** .5) lower_bound = mean - (1.96 * sem) upper_bound = mean + (1.96 * sem) stats["{}_n".format(column_name)] = column.count() stats["{}_upper_bound_95_perc_conf".format(column_name)] = upper_bound stats["{}_mean".format(column_name)] = mean stats["{}_lower_bound_95_perc_conf".format(column_name)] = lower_bound stats["{}_sem".format(column_name)] = sem for quantile in [10, 20, 30, 40, 50, 60, 70, 80, 90]: stats["{}_q{}".format(column_name, quantile)] = column.quantile(quantile / 100.) return [stats] else: warn( "Not enough data to compute summary_statistics ({}_{})" .format(label, heating_or_cooling) ) return []
[docs]def compute_summary_statistics( metrics_df, target_baseline_method="baseline_percentile", advanced_filtering=False): """ Computes summary statistics for the output dataframe. Computes the following statistics for each real-valued or integer valued column in the output dataframe: mean, standard error of the mean, and deciles. Parameters ---------- df : pd.DataFrame Output for which to compute summary statistics. label : str Name for this set of thermostat outputs. target_baseline_method : {"baseline_percentile", "baseline_regional"}, default "baseline_percentile" Baselining method by which samples will be filtered according to bad fits. Returns ------- stats : collections.OrderedDict An ordered dict containing the summary statistics. Column names are as follows, in which ### is a placeholder for the name of the column: - mean: ###_mean - standard error of the mean: ###_sem - 10th quantile: ###_10q - 20th quantile: ###_20q - 30th quantile: ###_30q - 40th quantile: ###_40q - 50th quantile: ###_50q - 60th quantile: ###_60q - 70th quantile: ###_70q - 80th quantile: ###_80q - 90th quantile: ###_90q - number of non-null core day sets: ###_n The following general values are also output: - label: label - number of total core day sets: n_total_core_day_sets """ if target_baseline_method not in ["baseline_percentile", "baseline_regional"]: message = ( 'Baseline method not supported - please use one of' ' "baseline_percentile" or "baseline_regional"' ) raise ValueError(message) def _identity_filter(row, df): return True def _range_filter(row, column_name, heating_or_cooling, lower_bound=-np.inf, upper_bound=np.inf, target_baseline=False): if target_baseline: full_column_selector = "{}_{}".format(column_name, target_baseline_method) else: full_column_selector = column_name column_value = row[full_column_selector] return lower_bound < column_value < upper_bound def _percentile_range_filter(row, column_name, heating_or_cooling, df, quantile=0.0, target_baseline=False): if target_baseline: full_column_selector = "{}_{}".format(column_name, target_baseline_method) else: full_column_selector = column_name lower_bound = df[full_column_selector].dropna().quantile(0.0 + quantile) upper_bound = df[full_column_selector].dropna().quantile(1.0 - quantile) return _range_filter(row, column_name, heating_or_cooling, lower_bound, upper_bound, target_baseline) def _tau_filter_heating(row, df): return _range_filter(row, "tau", "heating", 0, 25) def _tau_filter_cooling(row, df): return _range_filter(row, "tau", "cooling", 0, 25) def _cvrmse_filter_heating(row, df): return _range_filter(row, "cv_root_mean_sq_err", "heating", upper_bound=0.6) def _cvrmse_filter_cooling(row, df): return _range_filter(row, "cv_root_mean_sq_err", "cooling", upper_bound=0.6) def _savings_filter_p01_heating(row, df): return _percentile_range_filter(row, "percent_savings", "heating", df, 0.01, True) def _savings_filter_p01_cooling(row, df): return _percentile_range_filter(row, "percent_savings", "cooling", df, 0.01, True) def _combine_filters(filters): def _new_filter(row, df): return reduce(lambda x, y: x and y(row, df), filters, True) return _new_filter def heating_stats(df, filter_, label): heating_df = df[["heating" in name for name in df["heating_or_cooling"]]] return get_filtered_stats( heating_df, filter_, label, "heating", REAL_OR_INTEGER_VALUED_COLUMNS_HEATING, target_baseline_method) def cooling_stats(df, filter_, label): cooling_df = df[["cooling" in name for name in df["heating_or_cooling"]]] return get_filtered_stats( cooling_df, filter_, label, "cooling", REAL_OR_INTEGER_VALUED_COLUMNS_COOLING, target_baseline_method) very_cold_cold_df = metrics_df[[ (cz is not None) and "Very-Cold/Cold" in cz for cz in metrics_df["climate_zone"] ]] mixed_humid_df = metrics_df[[ (cz is not None) and "Mixed-Humid" in cz for cz in metrics_df["climate_zone"] ]] mixed_dry_hot_dry_df = metrics_df[[ (cz is not None) and "Mixed-Dry/Hot-Dry" in cz for cz in metrics_df["climate_zone"] ]] hot_humid_df = metrics_df[[ (cz is not None) and "Hot-Humid" in cz for cz in metrics_df["climate_zone"] ]] marine_df = metrics_df[[ (cz is not None) and "Marine" in cz for cz in metrics_df["climate_zone"] ]] filter_0 = _identity_filter filter_1_heating = _combine_filters([_tau_filter_heating]) filter_1_cooling = _combine_filters([_tau_filter_cooling]) filter_2_heating = _combine_filters([_tau_filter_heating, _cvrmse_filter_heating]) filter_2_cooling = _combine_filters([_tau_filter_cooling, _cvrmse_filter_cooling]) filter_3_heating = _combine_filters([_tau_filter_heating, _cvrmse_filter_heating, _savings_filter_p01_heating]) filter_3_cooling = _combine_filters([_tau_filter_cooling, _cvrmse_filter_cooling, _savings_filter_p01_cooling]) if advanced_filtering: stats = list(chain.from_iterable([ heating_stats(metrics_df, filter_0, "all_no_filter"), cooling_stats(metrics_df, filter_0, "all_no_filter"), heating_stats(very_cold_cold_df, filter_0, "very-cold_cold_no_filter"), cooling_stats(very_cold_cold_df, filter_0, "very-cold_cold"), heating_stats(mixed_humid_df, filter_0, "mixed-humid_no_filter"), cooling_stats(mixed_humid_df, filter_0, "mixed-humid_no_filter"), heating_stats(mixed_dry_hot_dry_df, filter_0, "mixed-dry_hot-dry_no_filter"), cooling_stats(mixed_dry_hot_dry_df, filter_0, "mixed-dry_hot-dry_no_filter"), heating_stats(hot_humid_df, filter_0, "hot-humid_no_filter"), cooling_stats(hot_humid_df, filter_0, "hot-humid_no_filter"), heating_stats(marine_df, filter_0, "marine_no_filter"), cooling_stats(marine_df, filter_0, "marine_no_filter"), heating_stats(metrics_df, filter_1_heating, "all_tau_filter"), cooling_stats(metrics_df, filter_1_cooling, "all_tau_filter"), heating_stats(very_cold_cold_df, filter_1_heating, "very-cold_cold_tau_filter"), cooling_stats(very_cold_cold_df, filter_1_cooling, "very-cold_cold_tau_filter"), heating_stats(mixed_humid_df, filter_1_heating, "mixed-humid_tau_filter"), cooling_stats(mixed_humid_df, filter_1_cooling, "mixed-humid_tau_filter"), heating_stats(mixed_dry_hot_dry_df, filter_1_heating, "mixed-dry_hot-dry_tau_filter"), cooling_stats(mixed_dry_hot_dry_df, filter_1_cooling, "mixed-dry_hot-dry_tau_filter"), heating_stats(hot_humid_df, filter_1_heating, "hot-humid_tau_filter"), cooling_stats(hot_humid_df, filter_1_cooling, "hot-humid_tau_filter"), heating_stats(marine_df, filter_1_heating, "marine_tau_filter"), cooling_stats(marine_df, filter_1_cooling, "marine_tau_filter"), heating_stats(metrics_df, filter_2_heating, "all_tau_cvrmse_filter"), cooling_stats(metrics_df, filter_2_cooling, "all_tau_cvrmse_filter"), heating_stats(very_cold_cold_df, filter_2_heating, "very-cold_cold_tau_cvrmse_filter"), cooling_stats(very_cold_cold_df, filter_2_cooling, "very-cold_cold_tau_cvrmse_filter"), heating_stats(mixed_humid_df, filter_2_heating, "mixed-humid_tau_cvrmse_filter"), cooling_stats(mixed_humid_df, filter_2_cooling, "mixed-humid_tau_cvrmse_filter"), heating_stats(mixed_dry_hot_dry_df, filter_2_heating, "mixed-dry_hot-dry_tau_cvrmse_filter"), cooling_stats(mixed_dry_hot_dry_df, filter_2_cooling, "mixed-dry_hot-dry_tau_cvrmse_filter"), heating_stats(hot_humid_df, filter_2_heating, "hot-humid_tau_cvrmse_filter"), cooling_stats(hot_humid_df, filter_2_cooling, "hot-humid_tau_cvrmse_filter"), heating_stats(marine_df, filter_2_heating, "marine_tau_cvrmse_filter"), cooling_stats(marine_df, filter_2_cooling, "marine_tau_cvrmse_filter"), heating_stats(metrics_df, filter_3_heating, "all_tau_cvrmse_savings_p01_filter"), cooling_stats(metrics_df, filter_3_cooling, "all_tau_cvrmse_savings_p01_filter"), heating_stats(very_cold_cold_df, filter_3_heating, "very-cold_cold_tau_cvrmse_savings_p01_filter"), cooling_stats(very_cold_cold_df, filter_3_cooling, "very-cold_cold_tau_cvrmse_savings_p01_filter"), heating_stats(mixed_humid_df, filter_3_heating, "mixed-humid_tau_cvrmse_savings_p01_filter"), cooling_stats(mixed_humid_df, filter_3_cooling, "mixed-humid_tau_cvrmse_savings_p01_filter"), heating_stats(mixed_dry_hot_dry_df, filter_3_heating, "mixed-dry_hot-dry_tau_cvrmse_savings_p01_filter"), cooling_stats(mixed_dry_hot_dry_df, filter_3_cooling, "mixed-dry_hot-dry_tau_cvrmse_savings_p01_filter"), heating_stats(hot_humid_df, filter_3_heating, "hot-humid_tau_cvrmse_savings_p01_filter"), cooling_stats(hot_humid_df, filter_3_cooling, "hot-humid_tau_cvrmse_savings_p01_filter"), heating_stats(marine_df, filter_3_heating, "marine_tau_cvrmse_savings_p01_filter"), cooling_stats(marine_df, filter_3_cooling, "marine_tau_cvrmse_savings_p01_filter"), ])) else: stats = list(chain.from_iterable([ heating_stats(metrics_df, filter_0, "all_no_filter"), cooling_stats(metrics_df, filter_0, "all_no_filter"), heating_stats(very_cold_cold_df, filter_0, "very-cold_cold_no_filter"), cooling_stats(very_cold_cold_df, filter_0, "very-cold_cold"), heating_stats(mixed_humid_df, filter_0, "mixed-humid_no_filter"), cooling_stats(mixed_humid_df, filter_0, "mixed-humid_no_filter"), heating_stats(mixed_dry_hot_dry_df, filter_0, "mixed-dry_hot-dry_no_filter"), cooling_stats(mixed_dry_hot_dry_df, filter_0, "mixed-dry_hot-dry_no_filter"), heating_stats(hot_humid_df, filter_0, "hot-humid_no_filter"), cooling_stats(hot_humid_df, filter_0, "hot-humid_no_filter"), heating_stats(marine_df, filter_0, "marine_no_filter"), cooling_stats(marine_df, filter_0, "marine_no_filter"), heating_stats(metrics_df, filter_3_heating, "all_tau_cvrmse_savings_p01_filter"), cooling_stats(metrics_df, filter_3_cooling, "all_tau_cvrmse_savings_p01_filter"), heating_stats(very_cold_cold_df, filter_3_heating, "very-cold_cold_tau_cvrmse_savings_p01_filter"), cooling_stats(very_cold_cold_df, filter_3_cooling, "very-cold_cold_tau_cvrmse_savings_p01_filter"), heating_stats(mixed_humid_df, filter_3_heating, "mixed-humid_tau_cvrmse_savings_p01_filter"), cooling_stats(mixed_humid_df, filter_3_cooling, "mixed-humid_tau_cvrmse_savings_p01_filter"), heating_stats(mixed_dry_hot_dry_df, filter_3_heating, "mixed-dry_hot-dry_tau_cvrmse_savings_p01_filter"), cooling_stats(mixed_dry_hot_dry_df, filter_3_cooling, "mixed-dry_hot-dry_tau_cvrmse_savings_p01_filter"), heating_stats(hot_humid_df, filter_3_heating, "hot-humid_tau_cvrmse_savings_p01_filter"), cooling_stats(hot_humid_df, filter_3_cooling, "hot-humid_tau_cvrmse_savings_p01_filter"), heating_stats(marine_df, filter_3_heating, "marine_tau_cvrmse_savings_p01_filter"), cooling_stats(marine_df, filter_3_cooling, "marine_tau_cvrmse_savings_p01_filter"), ])) stats_dict = {stat["label"]: stat for stat in stats} def _load_climate_zone_weights(filename_or_buffer): climate_zone_keys = { "Very-Cold/Cold": "very-cold_cold", "Mixed-Humid": "mixed-humid", "Mixed-Dry/Hot-Dry": "mixed-dry_hot-dry", "Hot-Humid": "hot-humid", "Marine": "marine", } df = pd.read_csv( filename_or_buffer, usecols=["climate_zone", "heating_weight", "cooling_weight"], ).set_index("climate_zone") heating_weights = {climate_zone_keys[cz]: weight for cz, weight in df["heating_weight"].iteritems()} cooling_weights = {climate_zone_keys[cz]: weight for cz, weight in df["cooling_weight"].iteritems()} return heating_weights, cooling_weights with resource_stream('thermostat.resources', 'NationalAverageClimateZoneWeightings.csv') as f: heating_weights, cooling_weights = _load_climate_zone_weights(f) def _compute_national_weightings(stats_by_climate_zone, keys, weights): def _national_weight(key): results = [] for cz, weight in weights.items(): stat_cz = stats_by_climate_zone.get(cz) if stat_cz is None: value = None else: value = stat_cz.get(key) if pd.notnull(weight) and pd.notnull(value): results.append((weight, value)) if len(results) == 0: return None else: weighted_sum = sum([weight * value for weight, value in results]) sum_of_weights = sum([weight for weight, _ in results]) return weighted_sum / sum_of_weights stats = [ "mean", "q10", "q20", "q30", "q40", "q50", "q60", "q70", "q80", "q90", ] key_stats = [ "{}_{}".format(key, stat) for key in keys for stat in stats ] return { "{}_{}".format(key_stat, "national_weighted_mean"): _national_weight(key_stat) for key_stat in key_stats } def _compute_national_weighting_lower_and_upper_bounds( stats_by_climate_zone, keys, weights): def _compute_bounds(key): # compute sem savings means, sems, weights_ = [], [], [] for cz, weight in weights.items(): stat_cz = stats_by_climate_zone.get(cz) if stat_cz is None: mean, sem = None, None else: mean = stat_cz.get("{}_mean".format(key), None) sem = stat_cz.get("{}_sem".format(key), None) if pd.notnull(weight) and pd.notnull(mean) and pd.notnull(sem): weights_.append(weight) means.append(mean) sems.append(sem) if len(weights_) == 0: return {} else: weighted_sum = sum([ weight * mean for weight, mean in zip(weights_, means) ]) weighted_mean = weighted_sum / sum(weights_) # renormalize weighted_sem = sum([ (weight*sem) ** 2 for weight, sem in zip(weights_, sems) ]) ** 0.5 lower_bound = weighted_mean - (1.96 * weighted_sem) upper_bound = weighted_mean + (1.96 * weighted_sem) return { "{}_lower_bound_95_perc_conf_national_weighted_mean".format(key): lower_bound, "{}_upper_bound_95_perc_conf_national_weighted_mean".format(key): upper_bound } items = {} for key in keys: items.update(_compute_bounds(key)) return items national_weighting_stats = [] if advanced_filtering: filters = [ "no_filter", "tau_filter", "tau_cvrmse_filter", "tau_cvrmse_savings_p01_filter", ] else: filters = [ "no_filter", "tau_cvrmse_savings_p01_filter", ] climate_zones = [ "mixed-humid", "mixed-dry_hot-dry", "marine", "hot-humid", "very-cold_cold" ] methods = [ "baseline_percentile", "baseline_regional", ] for season_type in ["heating", "cooling"]: if season_type == "heating": weights = heating_weights else: weights = cooling_weights for filter_ in filters: stats_by_climate_zone = { cz: stats_dict.get("{}_{}_{}".format(cz, filter_, season_type)) for cz in climate_zones } keys = ["percent_savings_{}".format(method) for method in methods] national_weightings = _compute_national_weightings( stats_by_climate_zone, keys, weights) bounds = _compute_national_weighting_lower_and_upper_bounds( stats_by_climate_zone, keys, weights) national_weightings.update(bounds) national_weightings.update( {"label": "national_weighted_mean_{}_{}".format(season_type, filter_)} ) national_weighting_stats.append(national_weightings) stats = national_weighting_stats + stats return stats
[docs]def summary_statistics_to_csv(stats, filepath, product_id): """ Write metric statistics to CSV file. Parameters ---------- stats : list of dict List of outputs from thermostat.stats.compute_summary_statistics() filepath : str Filepath at which to save the suppary statistics product_id : str A combination of the connected thermostat service plus one or more connected thermostat device models that comprises the data set. Returns ------- df : pandas.DataFrame A pandas dataframe containing the output data. """ columns = [ "label", "product_id", "sw_version", ] methods = [ "baseline_percentile", "baseline_regional", ] national_weighting_columns = list(chain.from_iterable([ [ "percent_savings_{}_mean_national_weighted_mean".format(method), "percent_savings_{}_q10_national_weighted_mean".format(method), "percent_savings_{}_q20_national_weighted_mean".format(method), "percent_savings_{}_q30_national_weighted_mean".format(method), "percent_savings_{}_q40_national_weighted_mean".format(method), "percent_savings_{}_q50_national_weighted_mean".format(method), "percent_savings_{}_q60_national_weighted_mean".format(method), "percent_savings_{}_q70_national_weighted_mean".format(method), "percent_savings_{}_q80_national_weighted_mean".format(method), "percent_savings_{}_q90_national_weighted_mean".format(method), "percent_savings_{}_lower_bound_95_perc_conf_national_weighted_mean".format(method), "percent_savings_{}_upper_bound_95_perc_conf_national_weighted_mean".format(method), ] for method in methods ])) columns.extend(national_weighting_columns) columns.extend([ "n_thermostat_core_day_sets_total", "n_thermostat_core_day_sets_kept", "n_thermostat_core_day_sets_discarded", ]) for column_name in REAL_OR_INTEGER_VALUED_COLUMNS_ALL: columns.append("{}_n".format(column_name)) columns.append("{}_upper_bound_95_perc_conf".format(column_name)) columns.append("{}_mean".format(column_name)) columns.append("{}_lower_bound_95_perc_conf".format(column_name)) columns.append("{}_sem".format(column_name)) for quantile in [10, 20, 30, 40, 50, 60, 70, 80, 90]: columns.append("{}_q{}".format(column_name, quantile)) # add product_id for row in stats: row["product_id"] = product_id # transpose for readability. stats_dataframe = pd.DataFrame(stats, columns=columns).set_index('label').transpose() stats_dataframe.to_csv(filepath) return stats_dataframe