res_reg_lmnt_awikner.process_test_data
#!/homes/awikner1/anaconda3/envs/res39/bin/python -u
# Assume will be finished in no more than 18 hours
# SBATCH -d afterok:{{JOB_ID}}
# SBATCH -J {{JOB_NAME}}
# SBATCH --output=log_files/{{JOB_NAME}}.log
# SBATCH -t 4:00:00
# SBATCH -A {{ACCOUNT}}
# Launch on 12 cores distributed over as many nodes as needed
# SBATCH --ntasks=1
# Assume need 6 GB/core (6144 MB/core)
# SBATCH --mem-per-cpu=6144
# SBATCH --mail-user=awikner1@umd.edu
# SBATCH --mail-type=BEGIN
# SBATCH --mail-type=END
from itertools import product
import sys
import os
import numpy as np
import pandas as pd
import time

from res_reg_lmnt_awikner.classes import RunOpts
from res_reg_lmnt_awikner.helpers import get_windows_path, get_filename


def process_data(argv=None, run_opts=None):
    """Aggregate the raw per-run result files into a single CSV and clean up.

    Reads the per-(reservoir, training set, noise value, regularization
    training time) text files found in ``run_opts.run_folder_name``, assembles
    them into one pandas DataFrame, writes it to ``run_opts.run_file_name``,
    and then deletes the raw files that are no longer needed.

    If ``run_opts.return_all`` is True, one row is emitted per regularization
    value (and merged with any previously saved file of the same name);
    otherwise only the results at the best regularization (per noise value and
    regularization training time, as ranked by ``run_opts.metric``) are kept.

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments used to build a ``RunOpts`` when ``run_opts``
        is not supplied.
    run_opts : RunOpts, optional
        Pre-constructed options object; takes precedence over ``argv``.
    """
    saved_flag = False

    tic = time.perf_counter()
    if argv is not None and run_opts is None:
        run_opts = RunOpts(argv)

    # Total size of the raw (uncompressed) result files, for reporting.
    raw_data_size = 0
    for ele in os.scandir(run_opts.run_folder_name):
        raw_data_size += os.stat(ele).st_size
    print('Raw data size: %0.2f kB' % (raw_data_size / 1000.))
    noise_vals = run_opts.noise_values_array
    reg_values = run_opts.reg_values
    reg_train_times = run_opts.reg_train_times

    rkTime = run_opts.test_time
    split = run_opts.sync_time
    # Number of valid-time windows per test interval.
    num_vt_tests = (rkTime - split) // run_opts.max_valid_time

    res_vals = np.arange(run_opts.res_start, run_opts.res_start + run_opts.res_per_test)
    train_vals = np.arange(run_opts.train_start, run_opts.train_start + run_opts.num_trains)
    test_vals = np.arange(run_opts.test_start, run_opts.test_start + run_opts.num_tests)

    # Result arrays, indexed as (res, train[, test], noise[, vt window], reg, reg_train).
    stable_frac = np.zeros(
        (run_opts.res_per_test, train_vals.size, noise_vals.size, reg_values.size, reg_train_times.size))
    train_mean_rms = np.zeros(
        (run_opts.res_per_test, train_vals.size, noise_vals.size, reg_values.size, reg_train_times.size))
    train_max_rms = np.zeros(
        (run_opts.res_per_test, train_vals.size, noise_vals.size, reg_values.size, reg_train_times.size))
    mean_rms = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_values.size,
                         reg_train_times.size))
    max_rms = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_values.size,
                        reg_train_times.size))
    variances = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_values.size,
                          reg_train_times.size))
    valid_time = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, num_vt_tests,
                           reg_values.size, reg_train_times.size))
    if run_opts.save_time_rms:
        rms = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, (rkTime - split),
                        reg_values.size, reg_train_times.size))
    if run_opts.pmap:
        pmap_max_wass_dist = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size,
                                       reg_values.size, reg_train_times.size))
    if run_opts.save_eigenvals:
        # Object array: the eigenvalue vector length is unknown until a file is read.
        eigenvals_in = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size),
                                dtype=object)

    print('Loading in raw data...')
    load_tic = time.perf_counter()
    # On Windows the result-file paths must be converted with get_windows_path;
    # elsewhere the plain path is used. A single wrapper lets one loading loop
    # serve both platforms (the original duplicated ~60 lines per branch).
    if os.name == 'nt':
        _path = get_windows_path
    else:
        def _path(filename):
            return filename
    for (i, res), (j, train), (k, noise), (l, reg_train_time) in product(
            enumerate(np.arange(run_opts.res_start, run_opts.res_start + run_opts.res_per_test, dtype=int)),
            enumerate(np.arange(run_opts.train_start, run_opts.train_start + run_opts.num_trains, dtype=int)),
            enumerate(noise_vals), enumerate(reg_train_times)):
        stable_frac[i, j, k, :, l] = np.loadtxt(
            _path(get_filename(run_opts.run_folder_name, 'stable_frac', res, train, noise, reg_train_time)),
            delimiter=',')
        train_mean_rms[i, j, k, :, l] = np.loadtxt(
            _path(get_filename(run_opts.run_folder_name, 'train_mean_rms', res, train, noise, reg_train_time)),
            delimiter=',')
        train_max_rms[i, j, k, :, l] = np.loadtxt(
            _path(get_filename(run_opts.run_folder_name, 'train_max_rms', res, train, noise, reg_train_time)),
            delimiter=',')
        mean_rms[i, j, :, k, :, l] = np.transpose(
            np.loadtxt(_path(get_filename(run_opts.run_folder_name, 'mean_rms', res, train, noise, reg_train_time)),
                       delimiter=',')).reshape((run_opts.num_tests, reg_values.size))
        max_rms[i, j, :, k, :, l] = np.transpose(
            np.loadtxt(_path(get_filename(run_opts.run_folder_name, 'max_rms', res, train, noise, reg_train_time)),
                       delimiter=',')).reshape((run_opts.num_tests, reg_values.size))
        variances[i, j, :, k, :, l] = np.transpose(
            np.loadtxt(_path(get_filename(run_opts.run_folder_name, 'variance', res, train, noise, reg_train_time)),
                       delimiter=',')).reshape((run_opts.num_tests, reg_values.size))
        for (m, test) in enumerate(np.arange(run_opts.test_start, run_opts.test_start + run_opts.num_tests)):
            valid_time[i, j, m, k, :, :, l] = np.transpose(np.loadtxt(
                _path(get_filename(run_opts.run_folder_name, 'valid_time', res, train, noise, reg_train_time,
                                   test_idx=test)),
                delimiter=',')).reshape((num_vt_tests, reg_values.size))
        if run_opts.pmap:
            pmap_max_wass_dist[i, j, :, k, :, l] = np.transpose(np.loadtxt(
                _path(get_filename(run_opts.run_folder_name, 'pmap_max_wass_dist', res, train, noise,
                                   reg_train_time)),
                delimiter=',')).reshape((run_opts.num_tests, reg_values.size))
        if run_opts.save_eigenvals:
            eigenvals_in[i, j, k, l] = np.loadtxt(
                _path(get_filename(run_opts.run_folder_name, 'gradreg_eigenvals', res, train, noise, reg_train_time)),
                delimiter=',')
        if run_opts.save_time_rms:
            for (m, test) in enumerate(np.arange(run_opts.test_start, run_opts.test_start + run_opts.num_tests)):
                rms[i, j, m, k, :, :, l] = np.transpose(np.loadtxt(
                    _path(get_filename(run_opts.run_folder_name, 'rms', res, train, noise, reg_train_time,
                                       test_idx=test)),
                    delimiter=',')).reshape(((rkTime - split), reg_values.size))
    if run_opts.save_eigenvals:
        # Repack the object array into a dense float array now that the
        # per-run eigenvalue vector length is known from the first entry.
        eigenvals = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size,
                              eigenvals_in[0, 0, 0, 0].size))
        for i, j, k, l in product(np.arange(run_opts.res_per_test, dtype=int), np.arange(train_vals.size, dtype=int),
                                  np.arange(noise_vals.size, dtype=int), np.arange(reg_train_times.size, dtype=int)):
            eigenvals[i, j, k, l] = eigenvals_in[i, j, k, l]
    load_toc = time.perf_counter()
    print('All data loaded in %0.2f sec.' % (load_toc - load_tic))

    if run_opts.return_all:
        # Emit one row per (res, train, test, noise, reg, reg_train) combination.
        save_filename = run_opts.run_file_name
        if os.path.exists(save_filename):
            saved_flag = True
            print('Found data file with the same name. Loading...')
            save_tic = time.perf_counter()
            saved_data = pd.read_csv(save_filename, index_col=0)
            save_toc = time.perf_counter()
            print('Saved data loaded in %0.2f sec.' % (save_toc - save_tic))

        all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_idx, all_reg_train_idx = np.meshgrid(
            np.arange(run_opts.res_per_test, dtype=int), np.arange(train_vals.size, dtype=int),
            np.arange(run_opts.num_tests, dtype=int), np.arange(noise_vals.size, dtype=int),
            np.arange(reg_values.size, dtype=int), np.arange(reg_train_times.size, dtype=int))

        all_res_idx = all_res_idx.flatten()
        all_train_idx = all_train_idx.flatten()
        all_test_idx = all_test_idx.flatten()
        all_noise_idx = all_noise_idx.flatten()
        all_reg_idx = all_reg_idx.flatten()
        all_reg_train_idx = all_reg_train_idx.flatten()
        all_res = res_vals[all_res_idx]
        all_train = train_vals[all_train_idx]
        all_test = test_vals[all_test_idx]
        all_noise = noise_vals[all_noise_idx]
        all_reg = reg_values[all_reg_idx]
        all_reg_train = reg_train_times[all_reg_train_idx]

        data_dict = {'res': all_res,
                     'train': all_train,
                     'test': all_test,
                     'noise': all_noise,
                     'reg': all_reg,
                     'reg_train': all_reg_train}

        data_out = pd.DataFrame(data_dict)
        data_out['stable_frac'] = stable_frac[all_res_idx, all_train_idx, all_noise_idx, all_reg_idx, all_reg_train_idx]
        data_out['train_mean_rms'] = train_mean_rms[
            all_res_idx, all_train_idx, all_noise_idx, all_reg_idx, all_reg_train_idx]
        data_out['train_max_rms'] = train_max_rms[
            all_res_idx, all_train_idx, all_noise_idx, all_reg_idx, all_reg_train_idx]
        data_out['mean_rms'] = mean_rms[
            all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_idx, all_reg_train_idx]
        data_out['max_rms'] = max_rms[
            all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_idx, all_reg_train_idx]
        data_out['variance'] = variances[
            all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_idx, all_reg_train_idx]
        data_out = pd.concat([data_out, pd.DataFrame(
            valid_time[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, :, all_reg_idx, all_reg_train_idx],
            columns=['valid_time%d' % i for i in range(num_vt_tests)])], axis=1)

        if run_opts.pmap:
            data_out['pmap_max_wass_dist'] = pmap_max_wass_dist[
                all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_idx, all_reg_train_idx]
        if run_opts.save_eigenvals:
            # Eigenvalues do not vary with test or regularization value, so
            # they are stored only on the (test == 0, reg == 0) rows.
            eigenval_idx = (all_test_idx == 0) & (all_reg_idx == 0)
            # BUG FIX: DataFrame.at only accepts scalar labels; .loc is required
            # for a boolean-mask / column-list assignment.
            data_out.loc[eigenval_idx, ['eig%d' % (i + 1) for i in range(eigenvals.shape[-1])]] = \
                eigenvals[all_res_idx[eigenval_idx], all_train_idx[eigenval_idx], all_noise_idx[eigenval_idx],
                          all_reg_train_idx[eigenval_idx]]
        if run_opts.save_time_rms:
            data_out = pd.concat([data_out, pd.DataFrame(
                rms[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, :, all_reg_idx, all_reg_train_idx],
                columns=['rms%d' % (i + 1) for i in range((rkTime - split))])], axis=1)

        if saved_flag:
            # Merge with the previously saved results, dropping duplicate rows.
            saved_cols = saved_data.columns.to_list()
            if set(saved_cols) != set(data_out.columns.to_list()):
                print('Saved Data set of the same name does not contain the same type of data.')
                print('Delete this file before running this code again.')
                raise ValueError
            data_out = pd.concat([data_out, saved_data], copy=False)
            data_out.drop_duplicates(['res', 'train', 'test', 'noise', 'reg', 'reg_train'], inplace=True)
            sort_tic = time.perf_counter()
            data_out.sort_values(['res', 'train', 'test', 'noise', 'reg', 'reg_train'], inplace=True,
                                 ignore_index=True)
            sort_toc = time.perf_counter()
            print('Data sorted in %0.2f sec.' % (sort_toc - sort_tic))
            raw_data_size = float(data_out.memory_usage().sum())

        print('Compressing and saving data...')
        save_tic = time.perf_counter()
        data_out.to_csv(save_filename)
        save_toc = time.perf_counter()
        print('Time to compress and save data: %0.2f sec.' % ((save_toc - save_tic)))
    else:
        # Keep only the results at the best regularization value per
        # (noise, reg_train_time) pair, ranked by run_opts.metric.
        best_stable_frac = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size))
        best_train_mean_rms = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size))
        best_train_max_rms = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size))
        best_mean_rms = np.zeros(
            (run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_train_times.size))
        best_max_rms = np.zeros(
            (run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_train_times.size))
        best_variances = np.zeros(
            (run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_train_times.size))
        best_valid_time = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size,
                                    num_vt_tests, reg_train_times.size))
        if run_opts.save_time_rms:
            best_rms = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size,
                                 (rkTime - split), reg_train_times.size))
        best_pmap_max_wass_dist = np.zeros(
            (run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_train_times.size))
        # BUG FIX: np.zeros takes the shape as a single tuple; passing two
        # positional sizes made the second one the dtype and raised a TypeError.
        stable_frac_alpha = np.zeros((noise_vals.size, reg_train_times.size))
        # BUG FIX: best_j must be integer so it can index reg_values below.
        best_j = np.zeros((noise_vals.size, reg_train_times.size), dtype=int)
        for (i, noise), (k, reg_train_time) in product(enumerate(noise_vals), enumerate(reg_train_times)):
            # 'mss_var' and 'valid_time' are maximized; the others are minimized.
            if run_opts.metric in ['mss_var', 'valid_time']:
                best_alpha_val = 0
            elif run_opts.metric in ['pmap_max_wass_dist', 'mean_rms', 'max_rms']:
                best_alpha_val = np.inf
            for j in range(reg_values.size):
                if run_opts.metric == 'mss_var':
                    metric_flag = np.mean(stable_frac[:, :, i, j, k]) > best_alpha_val
                elif run_opts.metric == 'valid_time':
                    metric_flag = np.median(valid_time[:, :, :, i, :, j, k]) > best_alpha_val
                elif run_opts.metric == 'pmap_max_wass_dist':
                    # BUG FIX: compare against the locally loaded array, not a
                    # (nonexistent) run_opts attribute.
                    metric_flag = np.mean(pmap_max_wass_dist[:, :, :, i, j, k]) <= best_alpha_val
                elif run_opts.metric == 'mean_rms':
                    metric_flag = np.mean(mean_rms[:, :, :, i, j, k]) <= best_alpha_val
                elif run_opts.metric == 'max_rms':
                    metric_flag = np.median(max_rms[:, :, :, i, j, k]) <= best_alpha_val
                # Accept this reg value if it improves the metric, or if the last
                # reg value is reached without any having been accepted.
                if metric_flag or (run_opts.metric in ['mss_var', 'valid_time'] and best_alpha_val == 0
                                   and j == reg_values.size - 1) or \
                        (run_opts.metric in ['pmap_max_wass_dist', 'mean_rms', 'max_rms']
                         and np.isinf(best_alpha_val) and j == reg_values.size - 1):
                    if run_opts.metric == 'mss_var':
                        best_alpha_val = np.mean(stable_frac[:, :, i, j, k])
                    elif run_opts.metric == 'valid_time':
                        best_alpha_val = np.median(valid_time[:, :, :, i, :, j, k])
                    elif run_opts.metric == 'pmap_max_wass_dist':
                        best_alpha_val = np.mean(pmap_max_wass_dist[:, :, :, i, j, k])
                    elif run_opts.metric == 'mean_rms':
                        best_alpha_val = np.mean(mean_rms[:, :, :, i, j, k])
                    elif run_opts.metric == 'max_rms':
                        best_alpha_val = np.median(max_rms[:, :, :, i, j, k])
                    # BUG FIX: these arrays are 4-D (res, train, noise, reg_train);
                    # the original indexed them with five indices.
                    # NOTE(review): the sign flip on stable_frac is preserved from
                    # the original code -- confirm it is intentional.
                    best_stable_frac[:, :, i, k] = -stable_frac[:, :, i, j, k]
                    best_train_mean_rms[:, :, i, k] = train_mean_rms[:, :, i, j, k]
                    best_train_max_rms[:, :, i, k] = train_max_rms[:, :, i, j, k]
                    best_variances[:, :, :, i, k] = variances[:, :, :, i, j, k]
                    best_mean_rms[:, :, :, i, k] = mean_rms[:, :, :, i, j, k]
                    best_max_rms[:, :, :, i, k] = max_rms[:, :, :, i, j, k]
                    best_valid_time[:, :, :, i, :, k] = valid_time[:, :, :, i, :, j, k]
                    if run_opts.pmap:
                        # BUG FIX: pmap_max_wass_dist only exists when run_opts.pmap
                        # is set, and the destination needs the full index set
                        # (the original wrote best_pmap_max_wass_dist[:, i]).
                        best_pmap_max_wass_dist[:, :, :, i, k] = pmap_max_wass_dist[:, :, :, i, j, k]
                    stable_frac_alpha[i, k] = reg_values[j]
                    best_j[i, k] = int(j)
                    if run_opts.save_time_rms:
                        best_rms[:, :, :, i, :, k] = rms[:, :, :, i, :, j, k]
        # BUG FIX: the test and noise aranges were swapped relative to the
        # unpacked names, silently mis-indexing whenever num_tests differed
        # from the number of noise values.
        all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_train_idx = np.meshgrid(
            np.arange(run_opts.res_per_test, dtype=int),
            np.arange(train_vals.size, dtype=int), np.arange(run_opts.num_tests, dtype=int),
            np.arange(noise_vals.size, dtype=int), np.arange(reg_train_times.size, dtype=int))

        all_res_idx = all_res_idx.flatten()
        all_train_idx = all_train_idx.flatten()
        all_test_idx = all_test_idx.flatten()
        all_noise_idx = all_noise_idx.flatten()
        all_reg_train_idx = all_reg_train_idx.flatten()
        all_res = res_vals[all_res_idx]
        all_train = train_vals[all_train_idx]
        all_test = test_vals[all_test_idx]
        all_noise = noise_vals[all_noise_idx]
        all_reg = reg_values[best_j[all_noise_idx, all_reg_train_idx]]
        all_reg_train = reg_train_times[all_reg_train_idx]

        data_dict = {'res': all_res,
                     'train': all_train,
                     'test': all_test,
                     'noise': all_noise,
                     'reg': all_reg,
                     'reg_train': all_reg_train}

        data_out = pd.DataFrame(data_dict)
        data_out['stable_frac'] = best_stable_frac[all_res_idx, all_train_idx, all_noise_idx, all_reg_train_idx]
        data_out['train_mean_rms'] = best_train_mean_rms[all_res_idx, all_train_idx, all_noise_idx, all_reg_train_idx]
        data_out['train_max_rms'] = best_train_max_rms[all_res_idx, all_train_idx, all_noise_idx, all_reg_train_idx]
        data_out['mean_rms'] = best_mean_rms[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_train_idx]
        data_out['max_rms'] = best_max_rms[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_train_idx]
        data_out['variance'] = best_variances[
            all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_train_idx]
        data_out = pd.concat([data_out, pd.DataFrame(
            best_valid_time[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, :, all_reg_train_idx],
            columns=['valid_time%d' % i for i in range(num_vt_tests)])], axis=1)

        if run_opts.pmap:
            data_out['pmap_max_wass_dist'] = best_pmap_max_wass_dist[
                all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_train_idx]
        if run_opts.save_time_rms:
            data_out = pd.concat([data_out, pd.DataFrame(
                best_rms[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, :, all_reg_train_idx],
                columns=['rms%d' % i for i in range((rkTime - split))])], axis=1)
            print('Concatenated rms')

        print('Compressing and saving data...')
        save_tic = time.perf_counter()
        data_out.to_csv(run_opts.run_file_name)
        save_toc = time.perf_counter()
        print('Time to compress and save data: %0.2f sec.' % ((save_toc - save_tic)))

    comp_data_size = os.stat(run_opts.run_file_name).st_size

    if run_opts.savepred:
        # Determine which prediction files must survive the cleanup below.
        if run_opts.return_all:
            pred_files = [get_filename(run_opts.run_folder_name, 'pred', res, train, noise, reg_train_time, reg=reg,
                                       test_idx=test, just_file=True)
                          for res, train, test, noise, reg_train_time, reg in
                          zip(data_out['res'], data_out['train'], data_out['test'],
                              data_out['noise'], data_out['reg_train'], data_out['reg'])]
        else:
            noise_vals_set_idx, reg_train_times_set_idx = np.meshgrid(np.arange(noise_vals.size, dtype=int),
                                                                      np.arange(reg_train_times.size, dtype=int))
            noise_vals_set_idx = noise_vals_set_idx.flatten()
            reg_train_times_set_idx = reg_train_times_set_idx.flatten()
            noise_vals_set = noise_vals[noise_vals_set_idx]
            reg_train_times_set = reg_train_times[reg_train_times_set_idx]
            pred_files = ['pred_res%d_train%d_test%d_noise%e_regtrain%d_reg%e.csv' % (
                res, train, test, noise, reg_train_time, reg)
                for res, train, test, (noise, reg_train_time, reg) in
                product(np.arange(run_opts.res_start, run_opts.res_start + run_opts.res_per_test),
                        np.arange(run_opts.train_start, run_opts.train_start + run_opts.num_trains),
                        np.arange(run_opts.test_start, run_opts.test_start + run_opts.num_tests),
                        zip(noise_vals_set, reg_train_times_set,
                            reg_values[best_j[noise_vals_set_idx, reg_train_times_set_idx]]))]
        # Set membership is O(1) per lookup; the raw folder can hold many files.
        pred_files = set(pred_files)
        all_files = os.listdir(run_opts.run_folder_name)
        for file in all_files:
            if file not in pred_files and 'true_test' not in file:
                os.remove(os.path.join(run_opts.run_folder_name, file))
        pred_data_size = 0
        for ele in os.scandir(run_opts.run_folder_name):
            pred_data_size += os.stat(ele).st_size
        raw_data_size += pred_data_size
        comp_data_size += pred_data_size
    else:
        # No predictions to keep: delete the raw files, preserving the true test
        # data (and the pmap maxima when pmap post-processing is enabled).
        all_files = os.listdir(run_opts.run_folder_name)
        if run_opts.pmap:
            for file in all_files:
                if os.path.isfile(os.path.join(run_opts.run_folder_name,
                                               file)) and 'pmap_max_res' not in file and 'true_test' not in file:
                    os.remove(os.path.join(run_opts.run_folder_name, file))
        else:
            for file in all_files:
                if os.path.isfile(os.path.join(run_opts.run_folder_name, file)) and 'true_test' not in file:
                    os.remove(os.path.join(run_opts.run_folder_name, file))
        if len(os.listdir(run_opts.run_folder_name)) == 0:
            os.rmdir(run_opts.run_folder_name)
    print('Compressed data size: %0.3f kB' % (comp_data_size / 1000))
    print('Data compressed by %0.3f percent' % ((1. - comp_data_size / float(raw_data_size)) * 100))
    toc = time.perf_counter()
    print('Compressed Results Saved in %f sec.' % (toc - tic))


def main(argv):
    """Command-line entry point: forward argv to process_data."""
    process_data(argv)


if __name__ == "__main__":
    main(sys.argv[1:])
def
process_data(argv=None, run_opts=None):
27def process_data(argv=None, run_opts=None): 28 saved_flag = False 29 30 tic = time.perf_counter() 31 if not isinstance(argv, type(None)) and isinstance(run_opts, type(None)): 32 run_opts = RunOpts(argv) 33 34 raw_data_size = 0 35 for ele in os.scandir(run_opts.run_folder_name): 36 raw_data_size += os.stat(ele).st_size 37 print('Raw data size: %0.2f kB' % (raw_data_size / 1000.)) 38 noise_vals = run_opts.noise_values_array 39 reg_values = run_opts.reg_values 40 reg_train_times = run_opts.reg_train_times 41 42 #print("Regularization training times:") 43 #print(reg_train_times) 44 #print(type(reg_train_times)) 45 #print(reg_train_times.shape) 46 47 rkTime = run_opts.test_time 48 split = run_opts.sync_time 49 num_vt_tests = (rkTime - split) // run_opts.max_valid_time 50 51 # def get_stability_output(out_full, data_path, filename, noise_indices, train_indices, res_per_test, run_opts.num_tests, reg_values, savepred, save_time_rms, run_opts.pmap, rkTime, split, metric = 'mss_var', return_all = False):#metric='pmap_max_wass_dist'): 52 # Function to process the output from all of the different reservoirs, trainning data sets, and noise values tested by find_stability. 53 # If return_all is True, this simply unpacks the linear output from the find_stability loop. 54 # If not, then this function returns only the results using the most optimal regularization (as defined by the metric) and using no regulariation. 
55 res_vals = np.arange(run_opts.res_start, run_opts.res_start + run_opts.res_per_test) 56 train_vals = np.arange(run_opts.train_start, run_opts.train_start + run_opts.num_trains) 57 test_vals = np.arange(run_opts.test_start, run_opts.test_start + run_opts.num_tests) 58 #print('Trains:') 59 #print(train_vals) 60 #print('Noise:') 61 #print(noise_vals) 62 #print('Res') 63 #print(run_opts.res_per_test) 64 #print('Regs:') 65 #print(reg_values) 66 67 stable_frac = np.zeros( 68 (run_opts.res_per_test, train_vals.size, noise_vals.size, reg_values.size, reg_train_times.size)) 69 train_mean_rms = np.zeros( 70 (run_opts.res_per_test, train_vals.size, noise_vals.size, reg_values.size, reg_train_times.size)) 71 train_max_rms = np.zeros( 72 (run_opts.res_per_test, train_vals.size, noise_vals.size, reg_values.size, reg_train_times.size)) 73 mean_rms = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_values.size, 74 reg_train_times.size)) 75 max_rms = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_values.size, 76 reg_train_times.size)) 77 variances = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_values.size, 78 reg_train_times.size)) 79 valid_time = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, num_vt_tests, 80 reg_values.size, reg_train_times.size)) 81 if run_opts.save_time_rms: 82 # mean_all = np.zeros((res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, (rkTime-split)+1, reg_values.size)) 83 # variances_all = np.zeros((res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, (rkTime-split)+1, reg_values.size)) 84 rms = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, (rkTime - split), 85 reg_values.size, reg_train_times.size)) 86 if run_opts.pmap: 87 pmap_max_wass_dist = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, 
noise_vals.size, 88 reg_values.size, reg_train_times.size)) 89 if run_opts.save_eigenvals: 90 eigenvals_in = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size), 91 dtype=object) 92 93 #print(np.arange(run_opts.res_per_test, dtype=int)) 94 #print(np.arange(run_opts.num_trains, dtype=int)) 95 #print(list(enumerate(noise_vals))) 96 #print(list(enumerate(reg_train_times))) 97 print('Loading in raw data...') 98 load_tic = time.perf_counter() 99 if os.name == 'nt': 100 for (i, res), (j, train), (k, noise), (l, reg_train_time) in product( 101 enumerate(np.arange(run_opts.res_start, run_opts.res_start + run_opts.res_per_test, dtype=int)), 102 enumerate(np.arange(run_opts.train_start, run_opts.train_start + run_opts.num_trains, dtype=int)), 103 enumerate(noise_vals), enumerate(reg_train_times)): 104 stable_frac[i, j, k, :, l] = np.loadtxt( 105 get_windows_path( 106 get_filename(run_opts.run_folder_name, 'stable_frac', res, train, noise, reg_train_time)), 107 delimiter=',') 108 train_mean_rms[i, j, k, :, l] = np.loadtxt( 109 get_windows_path( 110 get_filename(run_opts.run_folder_name, 'train_mean_rms', res, train, noise, reg_train_time)), 111 delimiter=',') 112 train_max_rms[i, j, k, :, l] = np.loadtxt( 113 get_windows_path( 114 get_filename(run_opts.run_folder_name, 'train_max_rms', res, train, noise, reg_train_time)), 115 delimiter=',') 116 mean_rms[i, j, :, k, :, l] = np.transpose( 117 np.loadtxt(get_windows_path( 118 get_filename(run_opts.run_folder_name, 'mean_rms', res, train, noise, reg_train_time)), 119 delimiter=',')).reshape((run_opts.num_tests, reg_values.size)) 120 max_rms[i, j, :, k, :, l] = np.transpose( 121 np.loadtxt(get_windows_path( 122 get_filename(run_opts.run_folder_name, 'max_rms', res, train, noise, reg_train_time)), 123 delimiter=',')).reshape((run_opts.num_tests, reg_values.size)) 124 variances[i, j, :, k, :, l] = np.transpose( 125 np.loadtxt(get_windows_path( 126 get_filename(run_opts.run_folder_name, 
'variance', res, train, noise, reg_train_time)), 127 delimiter=',')).reshape((run_opts.num_tests, reg_values.size)) 128 for (m, test) in enumerate(np.arange(run_opts.test_start, run_opts.test_start + run_opts.num_tests)): 129 valid_time[i, j, m, k, :, :, l] = np.transpose(np.loadtxt(get_windows_path( 130 get_filename(run_opts.run_folder_name, 'valid_time', res, train, noise, reg_train_time, 131 test_idx=test)), 132 delimiter=',')).reshape( 133 (num_vt_tests, reg_values.size)) 134 if run_opts.pmap: 135 pmap_max_wass_dist[i, j, :, k, :, l] = np.transpose(np.loadtxt( 136 get_windows_path(get_filename(run_opts.run_folder_name, 'pmap_max_wass_dist', res, train, noise, 137 reg_train_time)), 138 delimiter=',')).reshape((run_opts.num_tests, reg_values.size)) 139 if run_opts.save_eigenvals: 140 eigenvals_in[i, j, k, l] = np.loadtxt( 141 get_windows_path( 142 get_filename(run_opts.run_folder_name, 'gradreg_eigenvals', res, train, noise, reg_train_time)), 143 delimiter=',') 144 if run_opts.save_time_rms: 145 for (m, test) in enumerate(np.arange(run_opts.test_start, run_opts.test_start + run_opts.num_tests)): 146 rms[i, j, m, k, :, :, l] = np.transpose(np.loadtxt(get_windows_path( 147 get_filename(run_opts.run_folder_name, 'rms', res, train, noise, reg_train_time, 148 test_idx=test)), 149 delimiter=',')).reshape( 150 ((rkTime - split), reg_values.size)) 151 if run_opts.save_eigenvals: 152 eigenvals = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size, 153 eigenvals_in[0, 0, 0, 0].size)) 154 for i, j, k, l in product(np.arange(run_opts.res_per_test, dtype=int), 155 np.arange(train_vals.size, dtype=int), np.arange(noise_vals.size, dtype=int), 156 np.arange(reg_train_times.size, dtype=int)): 157 eigenvals[i, j, k, l] = eigenvals_in[i, j, k, l] 158 #print('Eigenvals shape:') 159 #print(eigenvals.shape) 160 else: 161 for (i, res), (j, train), (k, noise), (l, reg_train_time) in product( 162 enumerate(np.arange(run_opts.res_start, 
run_opts.res_start + run_opts.res_per_test, dtype=int)), 163 enumerate(np.arange(run_opts.train_start, run_opts.train_start + run_opts.num_trains, dtype=int)), 164 enumerate(noise_vals), enumerate(reg_train_times)): 165 stable_frac[i, j, k, :, l] = np.loadtxt( 166 get_filename(run_opts.run_folder_name, 'stable_frac', res, train, noise, reg_train_time), 167 delimiter=',') 168 train_mean_rms[i, j, k, :, l] = np.loadtxt( 169 get_filename(run_opts.run_folder_name, 'train_mean_rms', res, train, noise, reg_train_time), 170 delimiter=',') 171 train_max_rms[i, j, k, :, l] = np.loadtxt( 172 get_filename(run_opts.run_folder_name, 'train_max_rms', res, train, noise, reg_train_time), 173 delimiter=',') 174 mean_rms[i, j, :, k, :, l] = np.transpose( 175 np.loadtxt(get_filename(run_opts.run_folder_name, 'mean_rms', res, train, noise, reg_train_time), 176 delimiter=',')).reshape((run_opts.num_tests, reg_values.size)) 177 max_rms[i, j, :, k, :, l] = np.transpose( 178 np.loadtxt(get_filename(run_opts.run_folder_name, 'max_rms', res, train, noise, reg_train_time), 179 delimiter=',')).reshape((run_opts.num_tests, reg_values.size)) 180 variances[i, j, :, k, :, l] = np.transpose( 181 np.loadtxt(get_filename(run_opts.run_folder_name, 'variance', res, train, noise, reg_train_time), 182 delimiter=',')).reshape((run_opts.num_tests, reg_values.size)) 183 for (m, test) in enumerate(np.arange(run_opts.test_start, run_opts.test_start + run_opts.num_tests)): 184 valid_time[i, j, m, k, :, :, l] = np.transpose(np.loadtxt( 185 get_filename(run_opts.run_folder_name, 'valid_time', res, train, noise, reg_train_time, 186 test_idx=test), 187 delimiter=',')).reshape( 188 (num_vt_tests, reg_values.size)) 189 if run_opts.pmap: 190 pmap_max_wass_dist[i, j, :, k, :, l] = np.transpose(np.loadtxt( 191 get_filename(run_opts.run_folder_name, 'pmap_max_wass_dist', res, train, noise, reg_train_time), 192 delimiter=',')).reshape((run_opts.num_tests, reg_values.size)) 193 if run_opts.save_eigenvals: 194 
eigenvals_in[i, j, k, l] = np.loadtxt( 195 get_filename(run_opts.run_folder_name, 'gradreg_eigenvals', res, train, noise, reg_train_time), 196 delimiter=',') 197 if run_opts.save_time_rms: 198 for (m, test) in enumerate(np.arange(run_opts.test_start, run_opts.test_start + run_opts.num_tests)): 199 rms[i, j, m, k, :, :, l] = np.transpose(np.loadtxt( 200 get_filename(run_opts.run_folder_name, 'rms', res, train, noise, reg_train_time, test_idx=test), 201 delimiter=',')).reshape( 202 ((rkTime - split), reg_values.size)) 203 204 if run_opts.save_eigenvals: 205 eigenvals = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size, 206 eigenvals_in[0, 0, 0, 0].size)) 207 for i, j, k, l in product(np.arange(run_opts.res_per_test, dtype=int), np.arange(train_vals.size, dtype=int), 208 np.arange(noise_vals.size, dtype=int), np.arange(reg_train_times.size, dtype=int)): 209 eigenvals[i, j, k, l] = eigenvals_in[i, j, k, l] 210 #print('Eigenvals shape:') 211 #print(eigenvals.shape) 212 load_toc = time.perf_counter() 213 print('All data loaded in %0.2f sec.' % (load_toc - load_tic)) 214 215 if run_opts.return_all: 216 217 save_filename = run_opts.run_file_name 218 if os.path.exists(save_filename): 219 saved_flag = True 220 print('Found data file with the same name. Loading...') 221 save_tic = time.perf_counter() 222 saved_data = pd.read_csv(save_filename, index_col=0) 223 save_toc = time.perf_counter() 224 print('Saved data loaded in %0.2f sec.' 
% (save_toc - save_tic)) 225 226 all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_idx, all_reg_train_idx = np.meshgrid( 227 np.arange(run_opts.res_per_test, dtype=int), np.arange(train_vals.size, dtype=int), 228 np.arange(run_opts.num_tests, dtype=int), np.arange(noise_vals.size, dtype=int), 229 np.arange(reg_values.size, dtype=int), np.arange(reg_train_times.size, dtype=int)) 230 231 all_res_idx = all_res_idx.flatten() 232 all_train_idx = all_train_idx.flatten() 233 all_test_idx = all_test_idx.flatten() 234 all_noise_idx = all_noise_idx.flatten() 235 all_reg_idx = all_reg_idx.flatten() 236 all_reg_train_idx = all_reg_train_idx.flatten() 237 all_res = res_vals[all_res_idx] 238 all_train = train_vals[all_train_idx] 239 all_test = test_vals[all_test_idx] 240 all_noise = noise_vals[all_noise_idx] 241 all_reg = reg_values[all_reg_idx] 242 all_reg_train = reg_train_times[all_reg_train_idx] 243 244 data_dict = {'res': all_res, 245 'train': all_train, 246 'test': all_test, 247 'noise': all_noise, 248 'reg': all_reg, 249 'reg_train': all_reg_train} 250 251 data_out = pd.DataFrame(data_dict) 252 data_out['stable_frac'] = stable_frac[all_res_idx, all_train_idx, all_noise_idx, all_reg_idx, all_reg_train_idx] 253 data_out['train_mean_rms'] = train_mean_rms[ 254 all_res_idx, all_train_idx, all_noise_idx, all_reg_idx, all_reg_train_idx] 255 data_out['train_max_rms'] = train_max_rms[ 256 all_res_idx, all_train_idx, all_noise_idx, all_reg_idx, all_reg_train_idx] 257 data_out['mean_rms'] = mean_rms[ 258 all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_idx, all_reg_train_idx] 259 data_out['max_rms'] = max_rms[ 260 all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_idx, all_reg_train_idx] 261 data_out['variance'] = variances[ 262 all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_idx, all_reg_train_idx] 263 data_out = pd.concat([data_out, pd.DataFrame( 264 valid_time[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, 
:, all_reg_idx, all_reg_train_idx], 265 columns=['valid_time%d' % i for i in range(num_vt_tests)])], axis=1) 266 267 if run_opts.pmap: 268 data_out['pmap_max_wass_dist'] = pmap_max_wass_dist[ 269 all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_idx, all_reg_train_idx] 270 if run_opts.save_eigenvals: 271 #print(data_out[all_test_idx == 0].shape) 272 #print(data_out[all_test_idx == 0][['res', 'train', 'test', 'noise', 'reg', 'reg_train']]) 273 #print(eigenvals[all_res_idx, all_train_idx, all_noise_idx, all_reg_train_idx].shape) 274 #print(['eig%d' % (i + 1) for i in range(eigenvals.shape[-1])]) 275 eigenval_idx = (all_test_idx == 0) & (all_reg_idx == 0) 276 data_out.at[eigenval_idx, ['eig%d' % (i + 1) for i in range(eigenvals.shape[-1])]] = \ 277 eigenvals[all_res_idx[eigenval_idx], all_train_idx[eigenval_idx], all_noise_idx[eigenval_idx], \ 278 all_reg_train_idx[eigenval_idx]] 279 if run_opts.save_time_rms: 280 # data_out = pd.concat([data_out, pd.DataFrame(mean_all[all_res, all_train_idx, all_test, all_noise_idx, :, all_reg_idx],\ 281 # columns = ['mean_all%d' % i for i in range((rkTime-split)+1)])], axis = 1) 282 # print('Concatenated mean_all') 283 # data_out = pd.concat([data_out, pd.DataFrame(variances_all[all_res, all_train_idx, all_test, all_noise_idx, :, all_reg_idx],\ 284 # columns = ['variances_all%d' % i for i in range((rkTime-split)+1)])], axis = 1) 285 # print('Concatendated variances_all') 286 data_out = pd.concat([data_out, pd.DataFrame( 287 rms[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, :, all_reg_idx, all_reg_train_idx], 288 columns=['rms%d' % (i + 1) for i in range((rkTime - split))])], axis=1) 289 #print('Concatenated rms') 290 291 if saved_flag: 292 saved_cols = saved_data.columns.to_list() 293 if set(saved_cols) != set(data_out.columns.to_list()): 294 print('Saved Data set of the same name does not contain the same type of data.') 295 print('Delete this file before running this code again.') 296 raise ValueError 297 
data_out = pd.concat([data_out, saved_data], copy=False) 298 data_out.drop_duplicates(['res', 'train', 'test', 'noise', 'reg', 'reg_train'], inplace=True) 299 sort_tic = time.perf_counter() 300 data_out.sort_values(['res', 'train', 'test', 'noise', 'reg', 'reg_train'], inplace=True, ignore_index=True) 301 sort_toc = time.perf_counter() 302 print('Data sorted in %0.2f sec.' % (sort_toc - sort_tic)) 303 raw_data_size = float(data_out.memory_usage().sum()) 304 305 print('Compressing and saving data...') 306 save_tic = time.perf_counter() 307 data_out.to_csv(save_filename) 308 save_toc = time.perf_counter() 309 print('Time to compress and save data: %0.2f sec.' % ((save_toc - save_tic))) 310 311 # elif return_all and savepred: 312 # raise ValueError 313 else: 314 best_stable_frac = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size)) 315 best_train_mean_rms = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size)) 316 best_train_max_rms = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size)) 317 best_mean_rms = np.zeros( 318 (run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_train_times.size)) 319 best_max_rms = np.zeros( 320 (run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_train_times.size)) 321 best_variances = np.zeros( 322 (run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_train_times.size)) 323 best_valid_time = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, 324 num_vt_tests, reg_train_times.size)) 325 if run_opts.save_time_rms: 326 # best_mean_all = np.zeros((res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, (rkTime-split)+1)) 327 # best_variances_all = np.zeros((res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, (rkTime-split)+1)) 328 best_rms = np.zeros((run_opts.res_per_test, 
train_vals.size, run_opts.num_tests, noise_vals.size, 329 (rkTime - split), reg_train_times.size)) 330 best_pmap_max_wass_dist = np.zeros( 331 (run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_train_times.size)) 332 stable_frac_alpha = np.zeros(noise_vals.size, reg_train_times.size) 333 best_j = np.zeros(noise_vals.size, reg_train_times.size) 334 for (i, noise), (k, reg_train_time) in product(enumerate(noise_vals), enumerate(reg_train_times)): 335 if run_opts.metric in ['mss_var', 'valid_time']: 336 best_alpha_val = 0 337 elif run_opts.metric in ['pmap_max_wass_dist', 'mean_rms', 'max_rms']: 338 best_alpha_val = np.inf 339 for j in range(reg_values.size): 340 if run_opts.metric == 'mss_var': 341 metric_flag = np.mean( 342 stable_frac[:, :, i, j, k]) > best_alpha_val 343 elif run_opts.metric == 'valid_time': 344 metric_flag = np.median(valid_time[:, :, :, i, :, j, k]) > best_alpha_val 345 elif run_opts.metric == 'pmap_max_wass_dist': 346 # print(j) 347 # print(np.mean(run_opts.pmap_max_wass_dist[:, i, :, :, j-1])) 348 metric_flag = np.mean( 349 run_opts.pmap_max_wass_dist[:, :, :, i, j, k]) <= best_alpha_val 350 elif run_opts.metric == 'mean_rms': 351 metric_flag = np.mean(mean_rms[:, :, :, i, j, k]) <= best_alpha_val 352 elif run_opts.metric == 'max_rms': 353 metric_flag = np.median(max_rms[:, :, :, i, j, k]) <= best_alpha_val 354 if metric_flag or (run_opts.metric in ['mss_var', 355 'valid_time'] and best_alpha_val == 0 and j == reg_values.size - 1) or \ 356 (run_opts.metric in ['pmap_max_wass_dist', 'mean_rms', 'max_rms'] 357 and np.isinf(best_alpha_val) and j == reg_values.size - 1): 358 if run_opts.metric == 'mss_var': 359 best_alpha_val = np.mean(stable_frac[:, :, i, j, k]) 360 elif run_opts.metric == 'valid_time': 361 best_alpha_val = np.median(valid_time[:, :, :, i, :, j, k]) 362 elif run_opts.metric == 'pmap_max_wass_dist': 363 best_alpha_val = np.mean( 364 run_opts.pmap_max_wass_dist[:, :, :, i, j, k]) 365 elif 
run_opts.metric == 'mean_rms': 366 best_alpha_val = np.mean(mean_rms[:, :, :, i, j, k]) 367 elif run_opts.metric == 'max_rms': 368 best_alpha_val = np.median(max_rms[:, :, :, i, j, k]) 369 best_stable_frac[:, :, :, i, k] = -stable_frac[:, :, i, j, k] 370 best_train_mean_rms[:, :, :, i, k] = train_mean_rms[:, :, i, j, k] 371 best_train_max_rms[:, :, :, i, k] = train_max_rms[:, :, i, j, k] 372 best_variances[:, :, :, i, k] = variances[:, :, :, i, j, k] 373 best_mean_rms[:, :, :, i, k] = mean_rms[:, :, :, i, j, k] 374 best_max_rms[:, :, :, i, k] = max_rms[:, :, :, i, j, k] 375 best_valid_time[:, :, :, i, :, k] = valid_time[:, :, :, i, :, j, k] 376 best_pmap_max_wass_dist[:, i] = pmap_max_wass_dist[:, :, :, i, j, k] 377 stable_frac_alpha[i, k] = reg_values[j] 378 best_j[i, k] = int(j) 379 if run_opts.save_time_rms: 380 # best_mean_all[:,:,:,i] = mean_all[:,:,:,i,:,j] 381 # best_variances_all[:,:,:,i] = variances_all[:,:,:,i,:,j] 382 best_rms[:, :, :, i, :, k] = rms[:, :, :, i, :, j, k] 383 all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_train_idx = np.meshgrid( 384 np.arange(run_opts.res_per_test, dtype=int), 385 np.arange(train_vals.size, dtype=int), np.arange(noise_vals.size, dtype=int), 386 np.arange(run_opts.num_tests, dtype=int), np.arange(reg_train_times.size, dtype=int)) 387 388 all_res_idx = all_res_idx.flatten() 389 all_train_idx = all_train_idx.flatten() 390 all_test_idx = all_test_idx.flatten() 391 all_noise_idx = all_noise_idx.flatten() 392 all_reg_train_idx = all_reg_train_idx.flatten() 393 all_res = res_vals[all_res_idx] 394 all_train = train_vals[all_train_idx] 395 all_test = test_vals[all_test_idx] 396 all_noise = noise_vals[all_noise_idx] 397 all_reg = reg_values[best_j[all_noise_idx, all_reg_train_idx]] 398 all_reg_train = reg_train_times[all_reg_train_idx] 399 400 data_dict = {'res': all_res, 401 'train': all_train, 402 'test': all_test, 403 'noise': all_noise, 404 'reg': all_reg, 405 'reg_train': all_reg_train} 406 407 data_out = 
pd.DataFrame(data_dict) 408 data_out['stable_frac'] = best_stable_frac[all_res_idx, all_train_idx, all_noise_idx, all_reg_train_idx] 409 data_out['train_mean_rms'] = best_train_mean_rms[all_res_idx, all_train_idx, all_noise_idx, all_reg_train_idx] 410 data_out['train_max_rms'] = best_train_max_rms[all_res_idx, all_train_idx, all_noise_idx, all_reg_train_idx] 411 data_out['mean_rms'] = best_mean_rms[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_train_idx] 412 data_out['max_rms'] = best_max_rms[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_train_idx] 413 data_out['variance'] = best_variances[ 414 all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_train_idx] 415 data_out = pd.concat([data_out, pd.DataFrame( 416 best_valid_time[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, :, all_reg_train_idx], 417 columns=['valid_time%d' % i for i in range(num_vt_tests)])], axis=1) 418 419 if run_opts.pmap: 420 data_out['pmap_max_wass_dist'] = best_pmap_max_wass_dist[ 421 all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_train_idx] 422 if run_opts.save_time_rms: 423 # data_out = pd.concat([data_out, pd.DataFrame(best_mean_all[all_res, all_train_idx, all_test, all_noise_idx],\ 424 # columns = ['mean_all%d' % i for i in range((rkTime-split)+1)])], axis = 1) 425 # print('Concatenated mean_all') 426 # data_out = pd.concat([data_out, pd.DataFrame(best_variances_all[all_res, all_train_idx, all_test, all_noise_idx],\ 427 # columns = ['variances_all%d' % i for i in range((rkTime-split)+1)])], axis = 1) 428 # print('Concatenated variances_all') 429 data_out = pd.concat([data_out, pd.DataFrame( 430 best_rms[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, :, all_reg_train_idx], 431 columns=['rms%d' % i for i in range((rkTime - split))])], axis=1) 432 print('Concatenated rms') 433 434 print('Compressing and saving data...') 435 save_tic = time.perf_counter() 436 data_out.to_csv(run_opts.run_file_name) 437 
save_toc = time.perf_counter() 438 print('Time to compress and save data: %0.2f sec.' % ((save_toc - save_tic))) 439 440 comp_data_size = os.stat(run_opts.run_file_name).st_size 441 442 if run_opts.savepred: 443 if run_opts.return_all: 444 pred_files = [get_filename(run_opts.run_folder_name, 'pred', res, train, noise, reg_train_time, reg=reg, 445 test_idx=test, just_file=True) \ 446 for res, train, test, noise, reg_train_time, reg in 447 zip(data_out['res'], data_out['train'], data_out['test'], 448 data_out['noise'], data_out['reg_train'], data_out['reg'])] 449 else: 450 noise_vals_set_idx, reg_train_times_set_idx = np.meshgrid(np.arange(noise_vals.size, dtype=int), 451 np.arange(reg_train_times.size, dtype=int)) 452 noise_vals_set_idx = noise_vals_set_idx.flatten() 453 reg_train_times_set_idx = reg_train_times_set_idx.flatten() 454 noise_vals_set = noise_vals[noise_vals_set_idx] 455 reg_train_times_set = reg_train_times[reg_train_times_set_idx] 456 pred_files = ['pred_res%d_train%d_test%d_noise%e_regtrain%d_reg%e.csv' % ( 457 res, train, test, noise, reg_train_time, reg) \ 458 for res, train, test, (noise, reg_train_time, reg) in 459 product(np.arange(run_opts.res_start, run_opts.res_start + run_opts.res_per_test), 460 np.arange(run_opts.train_start, run_opts.train_start + run_opts.num_trains), 461 np.arange(run_opts.test_start, run_opts.test_start + run_opts.num_tests), 462 zip(noise_vals_set, reg_train_times_set, 463 reg_values[best_j[noise_vals_set_idx, reg_train_times_set_idx]]))] 464 #print('Pred file names') 465 #for file in pred_files: 466 # print(file) 467 all_files = os.listdir(run_opts.run_folder_name) 468 for file in all_files: 469 if file not in pred_files and 'true_test' not in file: 470 os.remove(os.path.join(run_opts.run_folder_name, file)) 471 pred_data_size = 0 472 for ele in os.scandir(run_opts.run_folder_name): 473 pred_data_size += os.stat(ele).st_size 474 raw_data_size += pred_data_size 475 comp_data_size += pred_data_size 476 else: 477 
all_files = os.listdir(run_opts.run_folder_name) 478 if run_opts.pmap: 479 for file in all_files: 480 if os.path.isfile(os.path.join(run_opts.run_folder_name, 481 file)) and 'pmap_max_res' not in file and 'true_test' not in file: 482 os.remove(os.path.join(run_opts.run_folder_name, file)) 483 else: 484 for file in all_files: 485 if os.path.isfile(os.path.join(run_opts.run_folder_name, file)) and 'true_test' not in file: 486 os.remove(os.path.join(run_opts.run_folder_name, file)) 487 if len(os.listdir(run_opts.run_folder_name)) == 0: 488 os.rmdir(run_opts.run_folder_name) 489 print('Compressed data size: %0.3f kB' % (comp_data_size / 1000)) 490 print('Data compressed by %0.3f percent' % ((1. - comp_data_size / float(raw_data_size)) * 100)) 491 toc = time.perf_counter() 492 print('Compressed Results Saved in %f sec.' % (toc - tic))
def
main(argv):