res_reg_lmnt_awikner.process_test_data

#!/homes/awikner1/anaconda3/envs/res39/bin/python -u
# Assume will be finished in no more than 18 hours
# SBATCH -d afterok:{{JOB_ID}}
# SBATCH -J {{JOB_NAME}}
# SBATCH --output=log_files/{{JOB_NAME}}.log
# SBATCH -t 4:00:00
# SBATCH -A {{ACCOUNT}}
# Launch on 12 cores distributed over as many nodes as needed
# SBATCH --ntasks=1
# Assume need 6 GB/core (6144 MB/core)
# SBATCH --mem-per-cpu=6144
# SBATCH --mail-user=awikner1@umd.edu
# SBATCH --mail-type=BEGIN
# SBATCH --mail-type=END
from itertools import product
import sys
import os
import numpy as np
import pandas as pd
import time

from res_reg_lmnt_awikner.classes import RunOpts
from res_reg_lmnt_awikner.helpers import get_windows_path, get_filename


def process_data(argv=None, run_opts=None):
    saved_flag = False

    tic = time.perf_counter()
    if argv is not None and run_opts is None:
        run_opts = RunOpts(argv)

    raw_data_size = 0
    for ele in os.scandir(run_opts.run_folder_name):
        raw_data_size += os.stat(ele).st_size
    print('Raw data size: %0.2f kB' % (raw_data_size / 1000.))
    noise_vals = run_opts.noise_values_array
    reg_values = run_opts.reg_values
    reg_train_times = run_opts.reg_train_times

    #print("Regularization training times:")
    #print(reg_train_times)
    #print(type(reg_train_times))
    #print(reg_train_times.shape)

    rkTime = run_opts.test_time
    split = run_opts.sync_time
    num_vt_tests = (rkTime - split) // run_opts.max_valid_time

    # def get_stability_output(out_full, data_path, filename, noise_indices, train_indices, res_per_test, run_opts.num_tests, reg_values, savepred, save_time_rms, run_opts.pmap, rkTime, split, metric = 'mss_var', return_all = False):#metric='pmap_max_wass_dist'):
    # Function to process the output from all of the different reservoirs, training data sets, and noise values tested by find_stability.
    # If return_all is True, this simply unpacks the linear output from the find_stability loop.
    # If not, then this function returns only the results using the optimal regularization (as defined by the metric) and using no regularization.
    res_vals = np.arange(run_opts.res_start, run_opts.res_start + run_opts.res_per_test)
    train_vals = np.arange(run_opts.train_start, run_opts.train_start + run_opts.num_trains)
    test_vals = np.arange(run_opts.test_start, run_opts.test_start + run_opts.num_tests)
    #print('Trains:')
    #print(train_vals)
    #print('Noise:')
    #print(noise_vals)
    #print('Res')
    #print(run_opts.res_per_test)
    #print('Regs:')
    #print(reg_values)

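    # The result arrays below are indexed as (reservoir, training set, [test,] noise level,
    # regularization value, regularization training time); valid_time and rms carry an
    # additional per-segment axis.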
    stable_frac = np.zeros(
        (run_opts.res_per_test, train_vals.size, noise_vals.size, reg_values.size, reg_train_times.size))
    train_mean_rms = np.zeros(
        (run_opts.res_per_test, train_vals.size, noise_vals.size, reg_values.size, reg_train_times.size))
    train_max_rms = np.zeros(
        (run_opts.res_per_test, train_vals.size, noise_vals.size, reg_values.size, reg_train_times.size))
    mean_rms = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_values.size,
                         reg_train_times.size))
    max_rms = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_values.size,
                        reg_train_times.size))
    variances = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_values.size,
                          reg_train_times.size))
    valid_time = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, num_vt_tests,
                           reg_values.size, reg_train_times.size))
    if run_opts.save_time_rms:
        # mean_all = np.zeros((res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, (rkTime-split)+1, reg_values.size))
        # variances_all = np.zeros((res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, (rkTime-split)+1, reg_values.size))
        rms = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, (rkTime - split),
                        reg_values.size, reg_train_times.size))
    if run_opts.pmap:
        pmap_max_wass_dist = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size,
                                       reg_values.size, reg_train_times.size))
    if run_opts.save_eigenvals:
        eigenvals_in = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size),
                                dtype=object)

    #print(np.arange(run_opts.res_per_test, dtype=int))
    #print(np.arange(run_opts.num_trains, dtype=int))
    #print(list(enumerate(noise_vals)))
    #print(list(enumerate(reg_train_times)))
    print('Loading in raw data...')
    load_tic = time.perf_counter()
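    # The two branches below are identical except that on Windows ('nt') each filename is
    # wrapped with get_windows_path before it is loaded.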
    if os.name == 'nt':
        for (i, res), (j, train), (k, noise), (l, reg_train_time) in product(
                enumerate(np.arange(run_opts.res_start, run_opts.res_start + run_opts.res_per_test, dtype=int)),
                enumerate(np.arange(run_opts.train_start, run_opts.train_start + run_opts.num_trains, dtype=int)),
                enumerate(noise_vals), enumerate(reg_train_times)):
            stable_frac[i, j, k, :, l] = np.loadtxt(
                get_windows_path(
                    get_filename(run_opts.run_folder_name, 'stable_frac', res, train, noise, reg_train_time)),
                delimiter=',')
            train_mean_rms[i, j, k, :, l] = np.loadtxt(
                get_windows_path(
                    get_filename(run_opts.run_folder_name, 'train_mean_rms', res, train, noise, reg_train_time)),
                delimiter=',')
            train_max_rms[i, j, k, :, l] = np.loadtxt(
                get_windows_path(
                    get_filename(run_opts.run_folder_name, 'train_max_rms', res, train, noise, reg_train_time)),
                delimiter=',')
            mean_rms[i, j, :, k, :, l] = np.transpose(
                np.loadtxt(get_windows_path(
                    get_filename(run_opts.run_folder_name, 'mean_rms', res, train, noise, reg_train_time)),
                           delimiter=',')).reshape((run_opts.num_tests, reg_values.size))
            max_rms[i, j, :, k, :, l] = np.transpose(
                np.loadtxt(get_windows_path(
                    get_filename(run_opts.run_folder_name, 'max_rms', res, train, noise, reg_train_time)),
                           delimiter=',')).reshape((run_opts.num_tests, reg_values.size))
            variances[i, j, :, k, :, l] = np.transpose(
                np.loadtxt(get_windows_path(
                    get_filename(run_opts.run_folder_name, 'variance', res, train, noise, reg_train_time)),
                           delimiter=',')).reshape((run_opts.num_tests, reg_values.size))
            for (m, test) in enumerate(np.arange(run_opts.test_start, run_opts.test_start + run_opts.num_tests)):
                valid_time[i, j, m, k, :, :, l] = np.transpose(np.loadtxt(get_windows_path(
                    get_filename(run_opts.run_folder_name, 'valid_time', res, train, noise, reg_train_time,
                                 test_idx=test)),
                    delimiter=',')).reshape(
                    (num_vt_tests, reg_values.size))
            if run_opts.pmap:
                pmap_max_wass_dist[i, j, :, k, :, l] = np.transpose(np.loadtxt(
                    get_windows_path(get_filename(run_opts.run_folder_name, 'pmap_max_wass_dist', res, train, noise,
                                                  reg_train_time)),
                    delimiter=',')).reshape((run_opts.num_tests, reg_values.size))
            if run_opts.save_eigenvals:
                eigenvals_in[i, j, k, l] = np.loadtxt(
                    get_windows_path(
                        get_filename(run_opts.run_folder_name, 'gradreg_eigenvals', res, train, noise, reg_train_time)),
                    delimiter=',')
            if run_opts.save_time_rms:
                for (m, test) in enumerate(np.arange(run_opts.test_start, run_opts.test_start + run_opts.num_tests)):
                    rms[i, j, m, k, :, :, l] = np.transpose(np.loadtxt(get_windows_path(
                        get_filename(run_opts.run_folder_name, 'rms', res, train, noise, reg_train_time,
                                     test_idx=test)),
                        delimiter=',')).reshape(
                        ((rkTime - split), reg_values.size))
        if run_opts.save_eigenvals:
            eigenvals = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size,
                                  eigenvals_in[0, 0, 0, 0].size))
            for i, j, k, l in product(np.arange(run_opts.res_per_test, dtype=int),
                                      np.arange(train_vals.size, dtype=int), np.arange(noise_vals.size, dtype=int),
                                      np.arange(reg_train_times.size, dtype=int)):
                eigenvals[i, j, k, l] = eigenvals_in[i, j, k, l]
            #print('Eigenvals shape:')
            #print(eigenvals.shape)
    else:
        for (i, res), (j, train), (k, noise), (l, reg_train_time) in product(
                enumerate(np.arange(run_opts.res_start, run_opts.res_start + run_opts.res_per_test, dtype=int)),
                enumerate(np.arange(run_opts.train_start, run_opts.train_start + run_opts.num_trains, dtype=int)),
                enumerate(noise_vals), enumerate(reg_train_times)):
            stable_frac[i, j, k, :, l] = np.loadtxt(
                get_filename(run_opts.run_folder_name, 'stable_frac', res, train, noise, reg_train_time),
                delimiter=',')
            train_mean_rms[i, j, k, :, l] = np.loadtxt(
                get_filename(run_opts.run_folder_name, 'train_mean_rms', res, train, noise, reg_train_time),
                delimiter=',')
            train_max_rms[i, j, k, :, l] = np.loadtxt(
                get_filename(run_opts.run_folder_name, 'train_max_rms', res, train, noise, reg_train_time),
                delimiter=',')
            mean_rms[i, j, :, k, :, l] = np.transpose(
                np.loadtxt(get_filename(run_opts.run_folder_name, 'mean_rms', res, train, noise, reg_train_time),
                delimiter=',')).reshape((run_opts.num_tests, reg_values.size))
            max_rms[i, j, :, k, :, l] = np.transpose(
                np.loadtxt(get_filename(run_opts.run_folder_name, 'max_rms', res, train, noise, reg_train_time),
                delimiter=',')).reshape((run_opts.num_tests, reg_values.size))
            variances[i, j, :, k, :, l] = np.transpose(
                np.loadtxt(get_filename(run_opts.run_folder_name, 'variance', res, train, noise, reg_train_time),
                delimiter=',')).reshape((run_opts.num_tests, reg_values.size))
            for (m, test) in enumerate(np.arange(run_opts.test_start, run_opts.test_start + run_opts.num_tests)):
                valid_time[i, j, m, k, :, :, l] = np.transpose(np.loadtxt(
                    get_filename(run_opts.run_folder_name, 'valid_time', res, train, noise, reg_train_time,
                                 test_idx=test),
                    delimiter=',')).reshape(
                    (num_vt_tests, reg_values.size))
            if run_opts.pmap:
                pmap_max_wass_dist[i, j, :, k, :, l] = np.transpose(np.loadtxt(
                    get_filename(run_opts.run_folder_name, 'pmap_max_wass_dist', res, train, noise, reg_train_time),
                    delimiter=',')).reshape((run_opts.num_tests, reg_values.size))
            if run_opts.save_eigenvals:
                eigenvals_in[i, j, k, l] = np.loadtxt(
                    get_filename(run_opts.run_folder_name, 'gradreg_eigenvals', res, train, noise, reg_train_time),
                    delimiter=',')
            if run_opts.save_time_rms:
                for (m, test) in enumerate(np.arange(run_opts.test_start, run_opts.test_start + run_opts.num_tests)):
                    rms[i, j, m, k, :, :, l] = np.transpose(np.loadtxt(
                        get_filename(run_opts.run_folder_name, 'rms', res, train, noise, reg_train_time, test_idx=test),
                        delimiter=',')).reshape(
                        ((rkTime - split), reg_values.size))

    if run_opts.save_eigenvals:
        eigenvals = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size,
                              eigenvals_in[0, 0, 0, 0].size))
        for i, j, k, l in product(np.arange(run_opts.res_per_test, dtype=int), np.arange(train_vals.size, dtype=int),
                                  np.arange(noise_vals.size, dtype=int), np.arange(reg_train_times.size, dtype=int)):
            eigenvals[i, j, k, l] = eigenvals_in[i, j, k, l]
        #print('Eigenvals shape:')
        #print(eigenvals.shape)
    load_toc = time.perf_counter()
    print('All data loaded in %0.2f sec.' % (load_toc - load_tic))

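    # Two output modes follow: with return_all, every (res, train, test, noise, reg, reg_train)
    # combination gets its own row; otherwise only the results at the metric-selected
    # regularization are assembled.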
    if run_opts.return_all:

        save_filename = run_opts.run_file_name
        if os.path.exists(save_filename):
            saved_flag = True
            print('Found data file with the same name. Loading...')
            save_tic = time.perf_counter()
            saved_data = pd.read_csv(save_filename, index_col=0)
            save_toc = time.perf_counter()
            print('Saved data loaded in %0.2f sec.' % (save_toc - save_tic))

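        # Build a long-format table: meshgrid enumerates every index combination, and the
        # flattened index arrays gather the corresponding entries from each result array.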
        all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_idx, all_reg_train_idx = np.meshgrid(
            np.arange(run_opts.res_per_test, dtype=int), np.arange(train_vals.size, dtype=int),
            np.arange(run_opts.num_tests, dtype=int), np.arange(noise_vals.size, dtype=int),
            np.arange(reg_values.size, dtype=int), np.arange(reg_train_times.size, dtype=int))

        all_res_idx = all_res_idx.flatten()
        all_train_idx = all_train_idx.flatten()
        all_test_idx = all_test_idx.flatten()
        all_noise_idx = all_noise_idx.flatten()
        all_reg_idx = all_reg_idx.flatten()
        all_reg_train_idx = all_reg_train_idx.flatten()
        all_res = res_vals[all_res_idx]
        all_train = train_vals[all_train_idx]
        all_test = test_vals[all_test_idx]
        all_noise = noise_vals[all_noise_idx]
        all_reg = reg_values[all_reg_idx]
        all_reg_train = reg_train_times[all_reg_train_idx]

        data_dict = {'res': all_res,
                     'train': all_train,
                     'test': all_test,
                     'noise': all_noise,
                     'reg': all_reg,
                     'reg_train': all_reg_train}

        data_out = pd.DataFrame(data_dict)
        data_out['stable_frac'] = stable_frac[all_res_idx, all_train_idx, all_noise_idx, all_reg_idx, all_reg_train_idx]
        data_out['train_mean_rms'] = train_mean_rms[
            all_res_idx, all_train_idx, all_noise_idx, all_reg_idx, all_reg_train_idx]
        data_out['train_max_rms'] = train_max_rms[
            all_res_idx, all_train_idx, all_noise_idx, all_reg_idx, all_reg_train_idx]
        data_out['mean_rms'] = mean_rms[
            all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_idx, all_reg_train_idx]
        data_out['max_rms'] = max_rms[
            all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_idx, all_reg_train_idx]
        data_out['variance'] = variances[
            all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_idx, all_reg_train_idx]
        data_out = pd.concat([data_out, pd.DataFrame(
            valid_time[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, :, all_reg_idx, all_reg_train_idx],
            columns=['valid_time%d' % i for i in range(num_vt_tests)])], axis=1)

        if run_opts.pmap:
            data_out['pmap_max_wass_dist'] = pmap_max_wass_dist[
                all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_idx, all_reg_train_idx]
        if run_opts.save_eigenvals:
            #print(data_out[all_test_idx == 0].shape)
            #print(data_out[all_test_idx == 0][['res', 'train', 'test', 'noise', 'reg', 'reg_train']])
            #print(eigenvals[all_res_idx, all_train_idx, all_noise_idx, all_reg_train_idx].shape)
            #print(['eig%d' % (i + 1) for i in range(eigenvals.shape[-1])])
            eigenval_idx = (all_test_idx == 0) & (all_reg_idx == 0)
            data_out.loc[eigenval_idx, ['eig%d' % (i + 1) for i in range(eigenvals.shape[-1])]] = \
                eigenvals[all_res_idx[eigenval_idx], all_train_idx[eigenval_idx], all_noise_idx[eigenval_idx], \
                          all_reg_train_idx[eigenval_idx]]
        if run_opts.save_time_rms:
            # data_out = pd.concat([data_out, pd.DataFrame(mean_all[all_res, all_train_idx, all_test, all_noise_idx, :, all_reg_idx],\
            #        columns = ['mean_all%d' % i for i in range((rkTime-split)+1)])], axis = 1)
            # print('Concatenated mean_all')
            # data_out = pd.concat([data_out, pd.DataFrame(variances_all[all_res, all_train_idx, all_test, all_noise_idx, :, all_reg_idx],\
            #        columns = ['variances_all%d' % i for i in range((rkTime-split)+1)])], axis = 1)
            # print('Concatenated variances_all')
            data_out = pd.concat([data_out, pd.DataFrame(
                rms[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, :, all_reg_idx, all_reg_train_idx],
                columns=['rms%d' % (i + 1) for i in range((rkTime - split))])], axis=1)
            #print('Concatenated rms')

        if saved_flag:
            saved_cols = saved_data.columns.to_list()
            if set(saved_cols) != set(data_out.columns.to_list()):
                print('Saved data set with the same name does not contain the same type of data.')
                print('Delete this file before running this code again.')
                raise ValueError
            data_out = pd.concat([data_out, saved_data], copy=False)
            data_out.drop_duplicates(['res', 'train', 'test', 'noise', 'reg', 'reg_train'], inplace=True)
            sort_tic = time.perf_counter()
            data_out.sort_values(['res', 'train', 'test', 'noise', 'reg', 'reg_train'], inplace=True, ignore_index=True)
            sort_toc = time.perf_counter()
            print('Data sorted in %0.2f sec.' % (sort_toc - sort_tic))
            raw_data_size = float(data_out.memory_usage().sum())

        print('Compressing and saving data...')
        save_tic = time.perf_counter()
        data_out.to_csv(save_filename)
        save_toc = time.perf_counter()
        print('Time to compress and save data: %0.2f sec.' % (save_toc - save_tic))

    # elif return_all and savepred:
    #    raise ValueError
    else:
        best_stable_frac = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size))
        best_train_mean_rms = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size))
        best_train_max_rms = np.zeros((run_opts.res_per_test, train_vals.size, noise_vals.size, reg_train_times.size))
        best_mean_rms = np.zeros(
            (run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_train_times.size))
        best_max_rms = np.zeros(
            (run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_train_times.size))
        best_variances = np.zeros(
            (run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_train_times.size))
        best_valid_time = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size,
                                    num_vt_tests, reg_train_times.size))
        if run_opts.save_time_rms:
            # best_mean_all = np.zeros((res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, (rkTime-split)+1))
            # best_variances_all = np.zeros((res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, (rkTime-split)+1))
            best_rms = np.zeros((run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size,
                                 (rkTime - split), reg_train_times.size))
        best_pmap_max_wass_dist = np.zeros(
            (run_opts.res_per_test, train_vals.size, run_opts.num_tests, noise_vals.size, reg_train_times.size))
        stable_frac_alpha = np.zeros((noise_vals.size, reg_train_times.size))
        best_j = np.zeros((noise_vals.size, reg_train_times.size), dtype=int)
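        # For each (noise, reg_train_time) pair, sweep the regularization values and keep the one
        # that optimizes the chosen metric (maximized for 'mss_var'/'valid_time', minimized otherwise).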
        for (i, noise), (k, reg_train_time) in product(enumerate(noise_vals), enumerate(reg_train_times)):
            if run_opts.metric in ['mss_var', 'valid_time']:
                best_alpha_val = 0
            elif run_opts.metric in ['pmap_max_wass_dist', 'mean_rms', 'max_rms']:
                best_alpha_val = np.inf
            for j in range(reg_values.size):
                if run_opts.metric == 'mss_var':
                    metric_flag = np.mean(
                        stable_frac[:, :, i, j, k]) > best_alpha_val
                elif run_opts.metric == 'valid_time':
                    metric_flag = np.median(valid_time[:, :, :, i, :, j, k]) > best_alpha_val
                elif run_opts.metric == 'pmap_max_wass_dist':
                    # print(j)
                    # print(np.mean(pmap_max_wass_dist[:, i, :, :, j-1]))
                    metric_flag = np.mean(
                        pmap_max_wass_dist[:, :, :, i, j, k]) <= best_alpha_val
                elif run_opts.metric == 'mean_rms':
                    metric_flag = np.mean(mean_rms[:, :, :, i, j, k]) <= best_alpha_val
                elif run_opts.metric == 'max_rms':
                    metric_flag = np.median(max_rms[:, :, :, i, j, k]) <= best_alpha_val
                if metric_flag or (run_opts.metric in ['mss_var',
                                                       'valid_time'] and best_alpha_val == 0 and j == reg_values.size - 1) or \
                        (run_opts.metric in ['pmap_max_wass_dist', 'mean_rms', 'max_rms']
                         and np.isinf(best_alpha_val) and j == reg_values.size - 1):
                    if run_opts.metric == 'mss_var':
                        best_alpha_val = np.mean(stable_frac[:, :, i, j, k])
                    elif run_opts.metric == 'valid_time':
                        best_alpha_val = np.median(valid_time[:, :, :, i, :, j, k])
                    elif run_opts.metric == 'pmap_max_wass_dist':
                        best_alpha_val = np.mean(
                            pmap_max_wass_dist[:, :, :, i, j, k])
                    elif run_opts.metric == 'mean_rms':
                        best_alpha_val = np.mean(mean_rms[:, :, :, i, j, k])
                    elif run_opts.metric == 'max_rms':
                        best_alpha_val = np.median(max_rms[:, :, :, i, j, k])
                    best_stable_frac[:, :, i, k] = -stable_frac[:, :, i, j, k]
                    best_train_mean_rms[:, :, i, k] = train_mean_rms[:, :, i, j, k]
                    best_train_max_rms[:, :, i, k] = train_max_rms[:, :, i, j, k]
                    best_variances[:, :, :, i, k] = variances[:, :, :, i, j, k]
                    best_mean_rms[:, :, :, i, k] = mean_rms[:, :, :, i, j, k]
                    best_max_rms[:, :, :, i, k] = max_rms[:, :, :, i, j, k]
                    best_valid_time[:, :, :, i, :, k] = valid_time[:, :, :, i, :, j, k]
                    best_pmap_max_wass_dist[:, :, :, i, k] = pmap_max_wass_dist[:, :, :, i, j, k]
                    stable_frac_alpha[i, k] = reg_values[j]
                    best_j[i, k] = int(j)
                    if run_opts.save_time_rms:
                        # best_mean_all[:,:,:,i] = mean_all[:,:,:,i,:,j]
                        # best_variances_all[:,:,:,i] = variances_all[:,:,:,i,:,j]
                        best_rms[:, :, :, i, :, k] = rms[:, :, :, i, :, j, k]
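        # Assemble the summary table over (res, train, test, noise, reg_train_time), reporting each
        # quantity at the regularization value selected above.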
        all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_train_idx = np.meshgrid(
            np.arange(run_opts.res_per_test, dtype=int),
            np.arange(train_vals.size, dtype=int), np.arange(noise_vals.size, dtype=int),
            np.arange(run_opts.num_tests, dtype=int), np.arange(reg_train_times.size, dtype=int))

        all_res_idx = all_res_idx.flatten()
        all_train_idx = all_train_idx.flatten()
        all_test_idx = all_test_idx.flatten()
        all_noise_idx = all_noise_idx.flatten()
        all_reg_train_idx = all_reg_train_idx.flatten()
        all_res = res_vals[all_res_idx]
        all_train = train_vals[all_train_idx]
        all_test = test_vals[all_test_idx]
        all_noise = noise_vals[all_noise_idx]
        all_reg = reg_values[best_j[all_noise_idx, all_reg_train_idx]]
        all_reg_train = reg_train_times[all_reg_train_idx]

        data_dict = {'res': all_res,
                     'train': all_train,
                     'test': all_test,
                     'noise': all_noise,
                     'reg': all_reg,
                     'reg_train': all_reg_train}

        data_out = pd.DataFrame(data_dict)
        data_out['stable_frac'] = best_stable_frac[all_res_idx, all_train_idx, all_noise_idx, all_reg_train_idx]
        data_out['train_mean_rms'] = best_train_mean_rms[all_res_idx, all_train_idx, all_noise_idx, all_reg_train_idx]
        data_out['train_max_rms'] = best_train_max_rms[all_res_idx, all_train_idx, all_noise_idx, all_reg_train_idx]
        data_out['mean_rms'] = best_mean_rms[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_train_idx]
        data_out['max_rms'] = best_max_rms[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_train_idx]
        data_out['variance'] = best_variances[
            all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_train_idx]
        data_out = pd.concat([data_out, pd.DataFrame(
            best_valid_time[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, :, all_reg_train_idx],
            columns=['valid_time%d' % i for i in range(num_vt_tests)])], axis=1)

        if run_opts.pmap:
            data_out['pmap_max_wass_dist'] = best_pmap_max_wass_dist[
                all_res_idx, all_train_idx, all_test_idx, all_noise_idx, all_reg_train_idx]
        if run_opts.save_time_rms:
            # data_out = pd.concat([data_out, pd.DataFrame(best_mean_all[all_res, all_train_idx, all_test, all_noise_idx],\
            #    columns = ['mean_all%d' % i for i in range((rkTime-split)+1)])], axis = 1)
            # print('Concatenated mean_all')
            # data_out = pd.concat([data_out, pd.DataFrame(best_variances_all[all_res, all_train_idx, all_test, all_noise_idx],\
            #         columns = ['variances_all%d' % i for i in range((rkTime-split)+1)])], axis = 1)
            # print('Concatenated variances_all')
            data_out = pd.concat([data_out, pd.DataFrame(
                best_rms[all_res_idx, all_train_idx, all_test_idx, all_noise_idx, :, all_reg_train_idx],
                columns=['rms%d' % i for i in range((rkTime - split))])], axis=1)
            print('Concatenated rms')

        print('Compressing and saving data...')
        save_tic = time.perf_counter()
        data_out.to_csv(run_opts.run_file_name)
        save_toc = time.perf_counter()
        print('Time to compress and save data: %0.2f sec.' % (save_toc - save_tic))

    comp_data_size = os.stat(run_opts.run_file_name).st_size

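    # Clean up the raw per-run files now that the results are collected in a single CSV; prediction
    # and 'true_test' files are kept or removed depending on the savepred and pmap settings.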
    if run_opts.savepred:
        if run_opts.return_all:
            pred_files = [get_filename(run_opts.run_folder_name, 'pred', res, train, noise, reg_train_time, reg=reg,
                                       test_idx=test, just_file=True) \
                          for res, train, test, noise, reg_train_time, reg in
                          zip(data_out['res'], data_out['train'], data_out['test'],
                              data_out['noise'], data_out['reg_train'], data_out['reg'])]
        else:
            noise_vals_set_idx, reg_train_times_set_idx = np.meshgrid(np.arange(noise_vals.size, dtype=int),
                                                                      np.arange(reg_train_times.size, dtype=int))
            noise_vals_set_idx = noise_vals_set_idx.flatten()
            reg_train_times_set_idx = reg_train_times_set_idx.flatten()
            noise_vals_set = noise_vals[noise_vals_set_idx]
            reg_train_times_set = reg_train_times[reg_train_times_set_idx]
            pred_files = ['pred_res%d_train%d_test%d_noise%e_regtrain%d_reg%e.csv' % (
                res, train, test, noise, reg_train_time, reg) \
                          for res, train, test, (noise, reg_train_time, reg) in
                          product(np.arange(run_opts.res_start, run_opts.res_start + run_opts.res_per_test),
                                  np.arange(run_opts.train_start, run_opts.train_start + run_opts.num_trains),
                                  np.arange(run_opts.test_start, run_opts.test_start + run_opts.num_tests),
                                  zip(noise_vals_set, reg_train_times_set,
                                      reg_values[best_j[noise_vals_set_idx, reg_train_times_set_idx]]))]
        #print('Pred file names')
        #for file in pred_files:
        #    print(file)
        all_files = os.listdir(run_opts.run_folder_name)
        for file in all_files:
            if file not in pred_files and 'true_test' not in file:
                os.remove(os.path.join(run_opts.run_folder_name, file))
        pred_data_size = 0
        for ele in os.scandir(run_opts.run_folder_name):
            pred_data_size += os.stat(ele).st_size
        raw_data_size += pred_data_size
        comp_data_size += pred_data_size
    else:
        all_files = os.listdir(run_opts.run_folder_name)
        if run_opts.pmap:
            for file in all_files:
                if os.path.isfile(os.path.join(run_opts.run_folder_name,
                                               file)) and 'pmap_max_res' not in file and 'true_test' not in file:
                    os.remove(os.path.join(run_opts.run_folder_name, file))
        else:
            for file in all_files:
                if os.path.isfile(os.path.join(run_opts.run_folder_name, file)) and 'true_test' not in file:
                    os.remove(os.path.join(run_opts.run_folder_name, file))
        if len(os.listdir(run_opts.run_folder_name)) == 0:
            os.rmdir(run_opts.run_folder_name)
    print('Compressed data size: %0.3f kB' % (comp_data_size / 1000))
    print('Data compressed by %0.3f percent' % ((1. - comp_data_size / float(raw_data_size)) * 100))
    toc = time.perf_counter()
    print('Compressed Results Saved in %f sec.' % (toc - tic))


def main(argv):
    process_data(argv)


if __name__ == "__main__":
    main(sys.argv[1:])
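
Usage note (an illustrative sketch, not part of the module source): besides the command-line entry point above, process_data can be driven from another script, either by forwarding an argument list or by passing a pre-built RunOpts object. Both call patterns below follow directly from the signature of process_data; my_argv is a hypothetical argument list accepted by RunOpts.

import sys

from res_reg_lmnt_awikner.classes import RunOpts
from res_reg_lmnt_awikner.process_test_data import process_data

# 1) Forward command-line style arguments, exactly as main() does.
process_data(sys.argv[1:])

# 2) Or construct the options object yourself and skip argument parsing inside process_data.
# run_opts = RunOpts(my_argv)
# process_data(run_opts=run_opts)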