From 345ab117129718b9c6919db73a56534e29a77b11 Mon Sep 17 00:00:00 2001 From: xychem Date: Mon, 12 Aug 2024 15:38:06 -0400 Subject: [PATCH 1/5] Add script for sampling 4-well potential from #176 --- notebooks/evil_example.py | 282 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 282 insertions(+) create mode 100644 notebooks/evil_example.py diff --git a/notebooks/evil_example.py b/notebooks/evil_example.py new file mode 100644 index 00000000..b4cda296 --- /dev/null +++ b/notebooks/evil_example.py @@ -0,0 +1,282 @@ +import numpy as np +import matplotlib.pyplot as plt +from sklearn.gaussian_process import GaussianProcessRegressor +import pandas as pd +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import ConstantKernel, RBF +from sklearn.metrics import mean_absolute_error,mean_squared_error,pairwise_distances,max_error +from DiverseSelector.methods.dissimilarity import MaxMin, OptiSim + + +# define potential energy function +def potential_energy(q1,q2): + ''' + q1: float or np.array + + q2: float or np.array + + ''' + v_0 = 5.0 + a_0 = 0.6 + a = [3.0,1.5,3.2,2.0] + b_q1 = 0.1 + b_q2 = 0.1 + sigma_q1 = [0.3,1.0,0.4,1.0] + sigma_q2 = [0.4,1.0,1.0,0.1] + alpha = [1.3,-1.5,1.4,-1.3] + beta = [-1.6,-1.7,1.8,1.23] + + v = v_0 + a_0*(np.exp(-(q1-b_q1)**2-(q2-b_q2)**2)) + + for i in range(4): + v = v - a[i]*np.exp(-sigma_q1[i]*(q1-alpha[i])**2-sigma_q2[i]*(q2-beta[i])**2) + + return v + + +# define boltzmann method +def boltzmann_sample(database,k): + ''' + database : np.array + the database we want to sample + + k : np.array or int, float + when k is larger, the database is more unbiased + the order of 1.5 + ''' + + E_0 = 1.780 # the minimum energy + + # if k == int + if type(k) == int or type(k) == float: + sample_list = [] + + rng = np.random.default_rng() + rn = rng.random((len(database),)) + p = np.exp(-np.array(k)*(1.5)*((potential_energy(database[:,0],database[:,1]))-E_0)) + for i in range(len(database)): + if p[i] >= rn[i]: + sample_list.append(database[i]) + + sample_list = np.array(sample_list) + + return sample_list + + # if k == np.array + sample_list = {} + + for i in range(len(k)): + sample_list[str(k[i])] = [] + + for j in range(len(k)): # if all of the boltzmann sampling databases have the sample_number points then break out + rng = np.random.default_rng() # define a random data generate tool + rn = rng.random((len(database),1)) # generate len(k) random numbers + p = np.exp(-np.array(k[j])*(1.5)*((potential_energy(database[:,0],database[:,1]))-E_0)) # calculate the probability of the point from -3 to 3 + for i in range(len(database)): + if p[i] >= rn[i]: # if p >= random_number than choose the point + sample_list[str(k[j])].append(database[i]) + + for i in range(len(k)): + sample_list[str(k[i])]=np.array(sample_list[str(k[i])]) + + return sample_list + + +# define random sample method +def random_sample(data,sample_number,sample_times): + sample_list=[] + for sample_time in range(sample_times): + label = np.random.choice(np.arange(len(data)),sample_number,replace=False) + sample_list.append(data[label]) + + sample_list = np.array(sample_list) + return sample_list + + +def generate_data(number): + q1 = np.linspace(-3,3,number) + q2 = np.linspace(-3,3,number) + + data = [] + for i in range(len(q1)): + for j in range(len(q2)): + data.append([q1[i],q2[j]]) + + data = np.array(data) + + return data + + +# define a class of error calculator +class ErrorCal(): + ''' + ---------- + features: np.ndarray + Indices of points' number and features + row of features represents the number of points + column of features represents the number of points' features + + values: np.ndarray + values of each points + + sample_number: int + The maximum number of selected points + ''' + def __init__(self,features,values,sample_number,function=None,labels=None): + self.features = features + self.values = values + self.sample_number = sample_number + self.function = function + self.labels = labels + + + # define random_error calculator + def random_error(self): + # choose points with random sampling + points_number = np.arange(len(self.features)) #the total points number + label = np.random.choice(points_number,self.sample_number,replace=False) #random choose the label of points + features_choice = self.features[label] #the selected points'features + values_choice = self.values[label] #the selected points'values + + # fit with Gaussian Process + kernel = ConstantKernel(constant_value=0.2) + RBF(length_scale=0.5, length_scale_bounds=(1e-7, 1e+7)) + gpr = GaussianProcessRegressor(n_restarts_optimizer=25,kernel=kernel) + model = gpr.fit(features_choice,values_choice) + + # test the error by regular grid method + test_points = generate_data(100) + + # calculate the maximum error + random_m_error = max_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) + # calculate the mean squared error + random_ms_error = mean_squared_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) + # calculate the mean absolute error + random_ma_error = mean_absolute_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) + + return random_m_error,random_ms_error,random_ma_error + + + # define maxmin_error calculator + def maxmin_error(self): + # choose points with maxmin sampling + if self.function != None: + features_dist = self.function(self.features) + else: + features_dist = pairwise_distances(X=self.features,Y=None,metric="euclidean") + + selector = MaxMin() + selected_ids = selector.select(arr=features_dist,size=self.sample_number,labels=self.labels) + features_choice = self.features[selected_ids] + values_choice = self.values[selected_ids] + + # fit with Gaussian Process + kernel = ConstantKernel(constant_value=0.2) + RBF(length_scale=0.5, length_scale_bounds=(1e-7, 1e+7)) + gpr = GaussianProcessRegressor(n_restarts_optimizer=25,kernel=kernel) + model = gpr.fit(features_choice,values_choice) + + # test the error by regular grid method + test_points = generate_data(100) + + # calculate the maximum error + maxmin_m_error = max_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) + # calculate the mean squared error + maxmin_ms_error = mean_squared_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) + # calculate the mean absolute error + maxmin_ma_error = mean_absolute_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) + + return maxmin_m_error,maxmin_ms_error,maxmin_ma_error + + + # define optisim_error calculator + def optisim_error(self): + # choose points with optisim sampling + selector = OptiSim() #the total points number + selected_ids = selector.select(self.features,size=self.sample_number,labels=self.labels) + features_choice = self.features[selected_ids] #the selected points'features + values_choice = self.values[selected_ids] #the selected points'values + + # fit with Gaussian Process + kernel = ConstantKernel(constant_value=0.2) + RBF(length_scale=0.5, length_scale_bounds=(1e-7, 1e+7)) + gpr = GaussianProcessRegressor(n_restarts_optimizer=25,kernel=kernel) + model = gpr.fit(features_choice,values_choice) + + # test the error by regular grid method + test_points = generate_data(100) + + # calculate the maximum error + optisim_m_error = max_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) + # calculate the mean squared error + optisim_ms_error = mean_squared_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) + # calculate the mean absolute error + optisim_ma_error = mean_absolute_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) + + return optisim_m_error,optisim_ms_error,optisim_ma_error + +if __name__ == '__main__': + # generate a total random database + rng = np.random.default_rng(seed=42) + total_database = (rng.random((int(1e7),2))-0.5)*6 + + + # using boltzmann sampling to get a sub_database + k = np.arange(1,5) + # k = np.append(k,np.inf) + + sub_database = boltzmann_sample(total_database,k) + + + # using random sampling in the sub_database to get 1e3 points sub_sub_database + sample_times = 30 + sample_number = 1000 + sub_sub_database = {} + for i in range(len(k)): + sub_sub_database[str(k[i])] = random_sample(sub_database[str(k[i])],sample_number,sample_times) + + + selected_number = range(1,1000,1) + + random_error = {} # maximum error, mean squared error, mean absolute error + maxmin_error = {} # maximum error, mean squared error, mean absolute error + optisim_error = {} # maximum error, mean squared error, mean absolute error + + for i in range(len(k)): # iteration with different k + random_error[str(k[i])] = [] + maxmin_error[str(k[i])] = [] + optisim_error[str(k[i])] = [] + + for s in selected_number: # the number of selected_points + random_error_list = [] + maxmin_error_list = [] + optsim_error_list = [] + + for j in range(sample_times): + selector = ErrorCal(sub_sub_database[str(k[i])][j,:],potential_energy(sub_sub_database[str(k[i])][j,:][:,0], + sub_sub_database[str(k[i])][j,:][:,1]),s) + random_error_list.append(selector.random_error()) + maxmin_error_list.append(selector.maxmin_error()) + optsim_error_list.append(selector.optisim_error()) + + random_error_list = np.mean(random_error_list,axis=0) + random_error[str(k[i])].append(random_error_list) + + maxmin_error_list = np.mean(maxmin_error_list,axis=0) + maxmin_error[str(k[i])].append(maxmin_error_list) + + optsim_error_list = np.mean(optsim_error_list,axis=0) + optisim_error[str(k[i])].append(optsim_error_list) + + random_error[str(k[i])] = np.array(random_error[str(k[i])]) + maxmin_error[str(k[i])] = np.array(maxmin_error[str(k[i])]) + optisim_error[str(k[i])] = np.array(optisim_error[str(k[i])]) + + + for i in range(len(k)): + random_data = pd.DataFrame(random_error[str(k[i])],columns=['max_error','mean_squared_error','mean_absolute_error'],index=selected_number) + random_data.to_csv('~/qc_selector/result/sub_sample/random__sample_error_when_k={}.csv'.format(k[i])) + + maxmin_data = pd.DataFrame(maxmin_error[str(k[i])],columns=['max_error','mean_squared_error','mean_absolute_error'],index=selected_number) + maxmin_data.to_csv('~/qc_selector/result/sub_sample/maxmin__sample_error_when_k={}.csv'.format(k[i])) + + optisim_data = pd.DataFrame(optisim_error[str(k[i])],columns=['max_error','mean_squared_error','mean_absolute_error'],index=selected_number) + optisim_data.to_csv('~/qc_selector/result/sub_sample/optisim_sample_error_when_k={}.csv'.format(k[i])) + From eed3af89c220b41d32c5443715e56597a0fb01a9 Mon Sep 17 00:00:00 2001 From: Farnaz Heidar-Zadeh Date: Mon, 12 Aug 2024 15:39:00 -0400 Subject: [PATCH 2/5] Rename evil_example.py to generate_data_pes_4well.py --- notebooks/{evil_example.py => generate_data_pes_4well.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename notebooks/{evil_example.py => generate_data_pes_4well.py} (100%) diff --git a/notebooks/evil_example.py b/notebooks/generate_data_pes_4well.py similarity index 100% rename from notebooks/evil_example.py rename to notebooks/generate_data_pes_4well.py From 186b7e718a10433db59632693fe65295f1a92ef3 Mon Sep 17 00:00:00 2001 From: Farnaz Heidar-Zadeh Date: Wed, 26 Jun 2024 17:39:38 -0400 Subject: [PATCH 3/5] Update 4-Well PES script --- notebooks/generate_data_pes_4well.py | 680 ++++++++++++++++----------- 1 file changed, 402 insertions(+), 278 deletions(-) diff --git a/notebooks/generate_data_pes_4well.py b/notebooks/generate_data_pes_4well.py index b4cda296..e647172b 100644 --- a/notebooks/generate_data_pes_4well.py +++ b/notebooks/generate_data_pes_4well.py @@ -1,282 +1,406 @@ +#!/usr/bin/env python3 + +import os +import sys +import json import numpy as np import matplotlib.pyplot as plt -from sklearn.gaussian_process import GaussianProcessRegressor -import pandas as pd -from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import ConstantKernel, RBF -from sklearn.metrics import mean_absolute_error,mean_squared_error,pairwise_distances,max_error -from DiverseSelector.methods.dissimilarity import MaxMin, OptiSim - - -# define potential energy function -def potential_energy(q1,q2): - ''' - q1: float or np.array - - q2: float or np.array - - ''' - v_0 = 5.0 - a_0 = 0.6 - a = [3.0,1.5,3.2,2.0] - b_q1 = 0.1 - b_q2 = 0.1 - sigma_q1 = [0.3,1.0,0.4,1.0] - sigma_q2 = [0.4,1.0,1.0,0.1] - alpha = [1.3,-1.5,1.4,-1.3] - beta = [-1.6,-1.7,1.8,1.23] - - v = v_0 + a_0*(np.exp(-(q1-b_q1)**2-(q2-b_q2)**2)) - - for i in range(4): - v = v - a[i]*np.exp(-sigma_q1[i]*(q1-alpha[i])**2-sigma_q2[i]*(q2-beta[i])**2) - - return v - - -# define boltzmann method -def boltzmann_sample(database,k): - ''' - database : np.array - the database we want to sample - - k : np.array or int, float - when k is larger, the database is more unbiased - the order of 1.5 - ''' - - E_0 = 1.780 # the minimum energy - - # if k == int - if type(k) == int or type(k) == float: - sample_list = [] - - rng = np.random.default_rng() - rn = rng.random((len(database),)) - p = np.exp(-np.array(k)*(1.5)*((potential_energy(database[:,0],database[:,1]))-E_0)) - for i in range(len(database)): - if p[i] >= rn[i]: - sample_list.append(database[i]) - - sample_list = np.array(sample_list) - - return sample_list - - # if k == np.array - sample_list = {} - - for i in range(len(k)): - sample_list[str(k[i])] = [] - - for j in range(len(k)): # if all of the boltzmann sampling databases have the sample_number points then break out - rng = np.random.default_rng() # define a random data generate tool - rn = rng.random((len(database),1)) # generate len(k) random numbers - p = np.exp(-np.array(k[j])*(1.5)*((potential_energy(database[:,0],database[:,1]))-E_0)) # calculate the probability of the point from -3 to 3 - for i in range(len(database)): - if p[i] >= rn[i]: # if p >= random_number than choose the point - sample_list[str(k[j])].append(database[i]) - - for i in range(len(k)): - sample_list[str(k[i])]=np.array(sample_list[str(k[i])]) - - return sample_list - - -# define random sample method -def random_sample(data,sample_number,sample_times): - sample_list=[] - for sample_time in range(sample_times): - label = np.random.choice(np.arange(len(data)),sample_number,replace=False) - sample_list.append(data[label]) - - sample_list = np.array(sample_list) - return sample_list - - -def generate_data(number): - q1 = np.linspace(-3,3,number) - q2 = np.linspace(-3,3,number) - - data = [] - for i in range(len(q1)): - for j in range(len(q2)): - data.append([q1[i],q2[j]]) - - data = np.array(data) - - return data - - -# define a class of error calculator -class ErrorCal(): - ''' - ---------- - features: np.ndarray - Indices of points' number and features - row of features represents the number of points - column of features represents the number of points' features - - values: np.ndarray - values of each points - - sample_number: int - The maximum number of selected points - ''' - def __init__(self,features,values,sample_number,function=None,labels=None): - self.features = features - self.values = values - self.sample_number = sample_number - self.function = function - self.labels = labels - - - # define random_error calculator - def random_error(self): - # choose points with random sampling - points_number = np.arange(len(self.features)) #the total points number - label = np.random.choice(points_number,self.sample_number,replace=False) #random choose the label of points - features_choice = self.features[label] #the selected points'features - values_choice = self.values[label] #the selected points'values - - # fit with Gaussian Process - kernel = ConstantKernel(constant_value=0.2) + RBF(length_scale=0.5, length_scale_bounds=(1e-7, 1e+7)) - gpr = GaussianProcessRegressor(n_restarts_optimizer=25,kernel=kernel) - model = gpr.fit(features_choice,values_choice) - - # test the error by regular grid method - test_points = generate_data(100) - - # calculate the maximum error - random_m_error = max_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) - # calculate the mean squared error - random_ms_error = mean_squared_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) - # calculate the mean absolute error - random_ma_error = mean_absolute_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) - - return random_m_error,random_ms_error,random_ma_error - - - # define maxmin_error calculator - def maxmin_error(self): - # choose points with maxmin sampling - if self.function != None: - features_dist = self.function(self.features) +from sklearn.metrics.pairwise import pairwise_distances +from selector import MaxMin, MaxSum, OptiSim, DISE, Medoid, GridPartition + + +class FourWellPotential: + def __init__(self): + # papers taken from paper + self.v0 = 5.0 + self.a0 = 0.6 + self.a = [3.0, 1.5, 3.2, 2.0] + self.b_q1 = 0.1 + self.b_q2 = 0.1 + self.s_q1 = [0.3, 1.0, 0.4, 1.0] + self.s_q2 = [0.4, 1.0, 1.0, 0.1] + self.alpha = [1.3, -1.5, 1.4, -1.3] + self.beta = [-1.6, -1.7, 1.8, 1.23] + # evaluate potential at A, B, C, D minima (taken from paper) + points = np.array([[1.29, -1.65], [1.4, 1.78], [-1.29, -1.53], [-1.17, 1.56]]) + self.minima = [points, self.evaluate(points)] + + def evaluate(self, points): + if points.ndim != 2 and points.shape[1] != 2: + raise ValueError("Argument points must be 2D array with 2 columns.") + + # TODO: This needs to be vectorized + values = [] + for q1, q2 in points: + v = self.v0 + self.a0 * (np.exp(-((q1 - self.b_q1) ** 2) - (q2 - self.b_q2) ** 2)) + for i in range(4): + v -= self.a[i] * np.exp( + -self.s_q1[i] * (q1 - self.alpha[i]) ** 2 + - self.s_q2[i] * (q2 - self.beta[i]) ** 2 + ) + values.append(v) + return np.array(values) + + def generate_points_uniform( + self, size_q1, size_q2, range_q1=(-3.0, 3.0), range_q2=(-3.0, 3.0), noise=0.0 + ): + # sample q1 and q2 coordinates along each axes + q1 = np.linspace(range_q1[0], range_q1[1], size_q1) + q2 = np.linspace(range_q2[0], range_q2[1], size_q2) + # combine q1 and q2 coordinates into a grid + data = np.array(np.meshgrid(q1, q2)).T.reshape(-1, 2) + return data + + +def select_monte_carlo(potential, min_potential, k, seed=42): + npoints = len(potential) + # compute Boltzman probability + prob = np.exp(-1.5 * k * (potential - min_potential)) + + rng = np.random.default_rng(seed=seed) + random = rng.random(size=npoints) + indices = np.where(prob > random)[0] + print("Monte Carlo selection with k:", k) + print("Minimum potential :", min_potential) + print("Total number of points :", npoints) + print("Number of selected points :", len(indices)) + return indices + + +def generate_datasets(size_random, k_values, size_dataset, niter=10, seed=42): + # Create directory for storing plots and npz data files, if it does not exist + os.makedirs("data", exist_ok=True) + + # Make an instance of the 4-well potential + # ---------------------------------------- + pes = FourWellPotential() + print("Coordinates of the 4 minima :", pes.minima[0]) + print("Potential at the 4 minima :", pes.minima[1]) + + # Plot 2D potential using uniform grid + # ------------------------------------ + points = pes.generate_points_uniform(size_q1=100, size_q2=100) + values = pes.evaluate(points) + print("Shape of PES points (for plotting):", points.shape) + print("Shape of PES values (for plotting): ", values.shape) + fname = "data/pes_4well.png" + plot_data_2d(points, values, highlight=pes.minima[0], title="Four Well Potential", fname=fname) + # 1D plot to see minimum D + # ------------------------ + # q1_fixed = -1.17 + # points_1d = pes.generate_points(size_q1=1, size_q2=50, range_q1=(q1_fixed, q1_fixed)) + # values_1d = pes.evaluate(points_1d) + # plt.plot(points_1d[:, 1], values_1d) + # plt.xlabel("q2", fontsize=14) + # plt.ylabel("Potential (kcal/mol)", fontsize=14) + # plt.title(f"Intersection of PES at q1={q1_fixed}", fontsize=18) + # plt.show() + + # Generate Dataset + # ---------------- + # Select a set of random 2D points between -3.0 and 3.0. For large sizes (e.g., 10M), + # this gives a dense and uniform coverage of the 4-well potential. + # rng.random generates points in [0, 1] interval, so they are shifted ans scaled + # to cover -3.0 to 3.0 range + rng = np.random.default_rng(seed=seed) + points_random = (rng.random((int(size_random), 2)) - 0.5) * 6.0 + values_random = pes.evaluate(points_random) + # plot_data_2d(points, values, highlight=points_random, title="Four Well Potential (kcal/mol)") + + # Use Monte-Carlo (MC) method to select points for a given k, which results in a different + # number of points for each k. The idea is that as k increases, the dataset becomes + # more biased so we can better demonstrate the importance of diverse subset sampling. + # Note that k=0 gives back the entire set, because probability of all points is one. + # However, as k increases the number of points selected by MC becomes smaller. This causes + # an unfair comparison, when we select a percentage of the data (for each k) as a subset. + # E.g., 1% of 10M (k=0) is already going to be diverse, no matter how the selection is + # made while 1% of 10K (k=30) is not going to be diverse in general. + # So, we select a fixed number of data generated by MC (for all k values) for the rest + # of our analysis. Keep in mind that this selection will be random so that it has a + # similar distribution (i.e, the same statistical bias) as the original MC sample set. + # Knowing that k=20 for 10M points, selected ~15K points, we choose 10K for fixed size. + for k in k_values: + indices_mc = select_monte_carlo(values_random, np.min(pes.minima[1]), k=k, seed=seed) + # points_mc = points_random[indices_mc] + dataset = {} + # make subdirectory for each k to store data + folder = f"data/monte_carlo_k{k:02d}" + os.makedirs(folder, exist_ok=True) + os.makedirs(f"{folder}/plots", exist_ok=True) + # randomly select a fixed-size dataset from each k niter times + for index in range(niter): + indices_iter = np.random.choice(indices_mc, size_dataset, replace=False) + print(f" Select {len(indices_iter)} points iteration {index}") + # plot MC selected points and the fixed-size dataset + fname = f"dataset_{size_random:2.1e}_k{k:02d}_size{size_dataset:05d}_iter{index:02d}" + plot_data_2d( + points, + values, + highlight=(points_random[indices_mc], points_random[indices_iter]), + title=f"Monte Carlo (k={k}): {len(indices_iter)} out of {len(indices_mc)}", + fname=f"{folder}/plots/{fname}.png", + ) + # store data for given k and iteration + # dataset[fname] = points_random[indices_iter] + dataset[fname] = { + "points": points_random[indices_iter].tolist(), + "values": values_random[indices_iter].tolist(), + } + # save data for given k + # np.savez_compressed(f"data/k{k:02.1f}/{fname.split('_iter')[0]}", **dataset) + print(f"Write {niter} iterations JSON data into data/{folder}") + with open(f"{folder}/{fname.split('_iter')[0]}.json", "w") as fp: + json.dump(dataset, fp, indent=4, sort_keys=True) + + +def select_subsets(points, percentages, methods, seed=42): + rng = np.random.default_rng(seed=seed) + selected_indices = {} + # loop over methods first to avoid recomputing points_dist for MaxMin and MaxSum + for method in methods: + print(f" Selection Method: {method}") + if method not in selected_indices: + selected_indices[method] = {} + points_dist = None + if method in ["MaxMin", "MaxSum"]: + points_dist = pairwise_distances(points, metric="euclidean") + for p in percentages: + # make an entry for each percentage + if p not in selected_indices[method]: + # str(p) is used as key because JSON does not allow storing int keys + selected_indices[method][str(p)] = {} + size = int(p * len(points) / 100) + print(f" Percentage={p: 2d}: Select {size} from {len(points)}") + if method == "Random": + indices = rng.choice(np.arange(len(points)), size, replace=False, shuffle=True) + elif method == "MaxMin": + indices = MaxMin(fun_dist=None).select(points_dist, size=size) + elif method == "MaxSum": + indices = MaxSum().select(points_dist, size=size) + elif method == "DISE": + indices = DISE().select(points, size=size) + elif method == "OptiSim": + indices = OptiSim().select(points, size=size) + elif method == "Medoid": + indices = Medoid().select(points, size=size) + elif method.startswith("GridPartition"): + grid_method = method.split("-")[1] + indices = GridPartition(numb_bins_axis=5, grid_method=grid_method).select( + points, size=size + ) + else: + raise ValueError(f"Unknown selection method: {method}") + # str(p) is used as key because JSON does not allow storing int keys + # indices are converted to int because JSON does not allow storing numpy.int64 + selected_indices[method][str(p)] = [int(item) for item in indices] + return selected_indices + + +def compute_diversity(points, selected_indices, measures): + diversity = {} + for measure in measures: + if measure == "LogDet": + diversity[measure] = logdet(points[selected_indices]) + elif measure == "WDUD": + diversity[measure] = wdud(points[selected_indices]) + elif measure == "MinDist": + points_dist = pairwise_distances(points[selected_indices], metric="euclidean") + upper_triangular = points_dist[np.triu_indices_from(points_dist, k=1)] + diversity[measure] = np.min(upper_triangular) + elif measure == "MeanDist": + points_dist = pairwise_distances(points[selected_indices], metric="euclidean") + upper_triangular = points_dist[np.triu_indices_from(points_dist, k=1)] + diversity[measure] = np.mean(upper_triangular) else: - features_dist = pairwise_distances(X=self.features,Y=None,metric="euclidean") - - selector = MaxMin() - selected_ids = selector.select(arr=features_dist,size=self.sample_number,labels=self.labels) - features_choice = self.features[selected_ids] - values_choice = self.values[selected_ids] - - # fit with Gaussian Process - kernel = ConstantKernel(constant_value=0.2) + RBF(length_scale=0.5, length_scale_bounds=(1e-7, 1e+7)) - gpr = GaussianProcessRegressor(n_restarts_optimizer=25,kernel=kernel) - model = gpr.fit(features_choice,values_choice) - - # test the error by regular grid method - test_points = generate_data(100) - - # calculate the maximum error - maxmin_m_error = max_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) - # calculate the mean squared error - maxmin_ms_error = mean_squared_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) - # calculate the mean absolute error - maxmin_ma_error = mean_absolute_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) - - return maxmin_m_error,maxmin_ms_error,maxmin_ma_error - - - # define optisim_error calculator - def optisim_error(self): - # choose points with optisim sampling - selector = OptiSim() #the total points number - selected_ids = selector.select(self.features,size=self.sample_number,labels=self.labels) - features_choice = self.features[selected_ids] #the selected points'features - values_choice = self.values[selected_ids] #the selected points'values - - # fit with Gaussian Process - kernel = ConstantKernel(constant_value=0.2) + RBF(length_scale=0.5, length_scale_bounds=(1e-7, 1e+7)) - gpr = GaussianProcessRegressor(n_restarts_optimizer=25,kernel=kernel) - model = gpr.fit(features_choice,values_choice) - - # test the error by regular grid method - test_points = generate_data(100) - - # calculate the maximum error - optisim_m_error = max_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) - # calculate the mean squared error - optisim_ms_error = mean_squared_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) - # calculate the mean absolute error - optisim_ma_error = mean_absolute_error(model.predict(test_points),potential_energy(test_points[:,0],test_points[:,1])) - - return optisim_m_error,optisim_ms_error,optisim_ma_error - -if __name__ == '__main__': - # generate a total random database - rng = np.random.default_rng(seed=42) - total_database = (rng.random((int(1e7),2))-0.5)*6 - - - # using boltzmann sampling to get a sub_database - k = np.arange(1,5) - # k = np.append(k,np.inf) - - sub_database = boltzmann_sample(total_database,k) - - - # using random sampling in the sub_database to get 1e3 points sub_sub_database - sample_times = 30 - sample_number = 1000 - sub_sub_database = {} - for i in range(len(k)): - sub_sub_database[str(k[i])] = random_sample(sub_database[str(k[i])],sample_number,sample_times) - - - selected_number = range(1,1000,1) - - random_error = {} # maximum error, mean squared error, mean absolute error - maxmin_error = {} # maximum error, mean squared error, mean absolute error - optisim_error = {} # maximum error, mean squared error, mean absolute error - - for i in range(len(k)): # iteration with different k - random_error[str(k[i])] = [] - maxmin_error[str(k[i])] = [] - optisim_error[str(k[i])] = [] - - for s in selected_number: # the number of selected_points - random_error_list = [] - maxmin_error_list = [] - optsim_error_list = [] - - for j in range(sample_times): - selector = ErrorCal(sub_sub_database[str(k[i])][j,:],potential_energy(sub_sub_database[str(k[i])][j,:][:,0], - sub_sub_database[str(k[i])][j,:][:,1]),s) - random_error_list.append(selector.random_error()) - maxmin_error_list.append(selector.maxmin_error()) - optsim_error_list.append(selector.optisim_error()) - - random_error_list = np.mean(random_error_list,axis=0) - random_error[str(k[i])].append(random_error_list) - - maxmin_error_list = np.mean(maxmin_error_list,axis=0) - maxmin_error[str(k[i])].append(maxmin_error_list) - - optsim_error_list = np.mean(optsim_error_list,axis=0) - optisim_error[str(k[i])].append(optsim_error_list) - - random_error[str(k[i])] = np.array(random_error[str(k[i])]) - maxmin_error[str(k[i])] = np.array(maxmin_error[str(k[i])]) - optisim_error[str(k[i])] = np.array(optisim_error[str(k[i])]) - - - for i in range(len(k)): - random_data = pd.DataFrame(random_error[str(k[i])],columns=['max_error','mean_squared_error','mean_absolute_error'],index=selected_number) - random_data.to_csv('~/qc_selector/result/sub_sample/random__sample_error_when_k={}.csv'.format(k[i])) + raise ValueError(f"Unknown diversity measure: {measure}") + return diversity + - maxmin_data = pd.DataFrame(maxmin_error[str(k[i])],columns=['max_error','mean_squared_error','mean_absolute_error'],index=selected_number) - maxmin_data.to_csv('~/qc_selector/result/sub_sample/maxmin__sample_error_when_k={}.csv'.format(k[i])) - - optisim_data = pd.DataFrame(optisim_error[str(k[i])],columns=['max_error','mean_squared_error','mean_absolute_error'],index=selected_number) - optisim_data.to_csv('~/qc_selector/result/sub_sample/optisim_sample_error_when_k={}.csv'.format(k[i])) - +def plot_data_2d(points, values, highlight=None, title="", fname=None): + """Plot 2D data with colorbar. + + Parameters + ---------- + points : np.ndarray + 2D array with 2 columns representing coordinates. + values : np.ndarray + 1D array with potential values for each point used for coloring. + highlight : np.ndarray + 2D array with 2 columns representing coordinates of points to highlight + with a star marker. + title : str + Title of the plot. + """ + if points.ndim != 2 and points.shape[1] != 2: + raise ValueError("Argument points must be 2D array with 2 columns") + plt.scatter(points[:, 0], points[:, 1], c=values, cmap="RdBu") + plt.colorbar(label="Potential (kcal/mol)") + if highlight is not None: + if isinstance(highlight, np.ndarray) and highlight.ndim == 2 and highlight.shape[1] == 2: + # raise ValueError("Argument highlight must be 2D array with 2 columns") + plt.scatter(highlight[:, 0], highlight[:, 1], c="black", marker="*", s=20) # , s=100) + else: + for index, item in enumerate(highlight): + assert item.ndim == 2 and item.shape[1] == 2 + if index == 0: + plt.scatter(item[:, 0], item[:, 1], c="black", marker="*", s=50) + else: + plt.scatter( + item[:, 0], + item[:, 1], + marker="o", + facecolors="none", + edgecolors="cyan", + linewidths=2, + s=10, + ) + + plt.title(title, fontsize=14) + plt.xlabel("q1", fontsize=14) + plt.ylabel("q2", fontsize=14) + + if fname is not None: + plt.savefig(fname, dpi=300, bbox_inches="tight") + else: + plt.show() + plt.close() + + +if __name__ == "__main__": + # get command line arguments + args = sys.argv[1:] + + if args[0] == "generate_data": + # These parameter choices are discussed in generate_datasets function + size_random, size_dataset = 1.0e7, 10000 + k_values = np.arange(0.0, 21.0, 1.0, dtype=int) + generate_datasets(size_random, k_values, size_dataset, niter=10, seed=42) + + elif args[0] == "select_subsets": + fnames = sorted(args[1:]) + # list of selector methods and percentages to select + selectors = [ + "Random", + "MaxMin", + "MaxSum", + "OptiSim", + "DISE", + "Medoid", + "GridPartition-equisized_dependent", + "GridPartition-equisized_independent", + "GridPartition-equifrequent_dependent", + "GridPartition-equifrequent_independent", + ] + percentages = [2**i for i in range(6)] + + # Make an instance of potential for visualization + # ----------------------------------------------- + pes = FourWellPotential() + points_plot = pes.generate_points_uniform(size_q1=100, size_q2=100) + values_plot = pes.evaluate(points_plot) + + # Loop over datasets, select subsets, plot and store selected indices + # ------------------------------------------------------------------- + # example of fname: data/monte_carlo_k00/dataset_1.0e+06_k00_size01000.json + for fname in fnames: + # Load dataset from JSON file + # --------------------------- + print(f"LOAD {fname}") + with open(fname, "r") as fp: + data = json.load(fp) + folder, fname = os.path.split(fname) + os.makedirs(f"{folder}/plots", exist_ok=True) + # Loop over iterations of dataset + # ------------------------------- + database_indices = {} + for fname_iter, values_iter in data.items(): + print(f" SELECT from: {fname_iter}") + points_iter = np.array(values_iter["points"]) + # values_iter = np.array(values_iter["values"]) + # example of fname_points: dataset_1.0e+07_k10.0_size10000_iter00 + # Select subsets + # -------------- + # indices are returned as a nested dictionary {method: {percentage: indices}} + selected_indices = select_subsets(points_iter, percentages, selectors, seed=42) + database_indices[fname_iter] = selected_indices + # Plot Subsets + # ------------ + for method, values in selected_indices.items(): + for p, indices in values.items(): + p = int(p) + fname_plot = f"{folder}/plots/subset_{method}_p{p:02d}_{fname_iter}.png" + plot_data_2d( + points_plot, + values_plot, + highlight=(points_iter, points_iter[indices]), + title=f"{fname_iter}\nMethod={method}, Percentage={p}, Size={len(indices)}", + fname=fname_plot, + ) + # Save selected indices from fname database + # ----------------------------------------- + for method in selectors: + temp = {fname: values[method] for fname, values in database_indices.items()} + fname_json = fname.replace("dataset", f"subset_{method}_dataset") + fname_json = f"{folder}/{fname_json}" + print(f"WRITE {fname_json}") + with open(fname_json, "w") as fp: + json.dump(temp, fp, indent=4, sort_keys=True) + + elif args[0] == "compute_diversity": + # make a dictionary for each dataset, so that diversity of iterations can be stored + # diversity_iterations = diversity.setdefault(fname.replace("data/", ""), {}) + # measures = ["LogDet", "WDUD", "MinDist", "MeanDist"] + # # Compute diversity measures and plot subsets + # # ------------------------------------------- + # for method, values in selected_indices.items(): + # if method not in diversity_iterations: + # diversity_iterations[method] = {} + # for p, indices in values.items(): + # if p not in diversity_iterations[method]: + # diversity_iterations[method][p] = {} + # # compute diversity returns a dictionary {measure: value} + # print(f" Compute diversity for method={method} and p={p}") + # d = compute_diversity(points_selected, indices, measures) + # for measure, value in d.items(): + # items = diversity_iterations[method][p].setdefault(measure, []) + # items.append(value) + # # plot subsets + # fname = f"data/{folder}/{method}_p{p:02d}_{fname_selected}.png" + # plot_data_2d( + # points_plot, + # values_plot, + # highlight=(points_selected, points_selected[indices]), + # title=f"{fname_selected}\nMethod={method}, Percentage={p}, Size={len(indices)}", + # fname=fname, + # ) + + # # Save diversity data + # # ------------------- + # # data is {fname: {method: {percentage: {measure: iteration_diversity_values}}}} + # print("Save diversity data to data/diversity.json") + # with open("data/diversity.json", "w") as fp: + # json.dump(diversity, fp, indent=4, sort_keys=True) + + # data is {fname: {method: {percentage: {measure: iteration_diversity_values}}}} + with open("data/diversity.json", "r") as fp: + data = json.load(fp) + print(len(data)) + measures = ["LogDet", "WDUD"] + for measure in measures: + # for each dataset, plot diversity measures vs. percentage + for fname, diversity in data.items(): + print(fname) + print(diversity) + print(type(diversity)) + for method, values in diversity.items(): + x_values = sorted([int(p) for p in values.keys()]) + print("percentage values = ", x_values) + y_values = [np.mean(values[str(p)][measure]) for p in x_values] + plt.plot(x_values, y_values, label=method) + plt.xlabel("Percentage (%)") + plt.ylabel(measure) + plt.title(fname) + plt.legend() + plt.show() + else: + raise ValueError(f"Unknown task={args}") From 92b567f319deb20cb4c9f44c0e10d95920efc3d2 Mon Sep 17 00:00:00 2001 From: Farnaz Heidar-Zadeh Date: Wed, 26 Jun 2024 17:40:17 -0400 Subject: [PATCH 4/5] Update generate_data_pes_4well.py --- notebooks/generate_data_pes_4well.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 notebooks/generate_data_pes_4well.py diff --git a/notebooks/generate_data_pes_4well.py b/notebooks/generate_data_pes_4well.py old mode 100644 new mode 100755 From 3803761d388f27ec10f9f559fc3617f7e16eec3a Mon Sep 17 00:00:00 2001 From: Farnaz Heidar-Zadeh Date: Tue, 10 Sep 2024 08:55:07 -0400 Subject: [PATCH 5/5] Update script to make plot folders for each selection method --- notebooks/generate_data_pes_4well.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/notebooks/generate_data_pes_4well.py b/notebooks/generate_data_pes_4well.py index e647172b..58859f50 100755 --- a/notebooks/generate_data_pes_4well.py +++ b/notebooks/generate_data_pes_4well.py @@ -185,7 +185,7 @@ def select_subsets(points, percentages, methods, seed=42): indices = Medoid().select(points, size=size) elif method.startswith("GridPartition"): grid_method = method.split("-")[1] - indices = GridPartition(numb_bins_axis=5, grid_method=grid_method).select( + indices = GridPartition(nbins_axis=5, bin_method=grid_method).select( points, size=size ) else: @@ -291,7 +291,7 @@ def plot_data_2d(points, values, highlight=None, title="", fname=None): "GridPartition-equifrequent_dependent", "GridPartition-equifrequent_independent", ] - percentages = [2**i for i in range(6)] + percentages = [2**i for i in range(7)] # Make an instance of potential for visualization # ----------------------------------------------- @@ -323,12 +323,18 @@ def plot_data_2d(points, values, highlight=None, title="", fname=None): # indices are returned as a nested dictionary {method: {percentage: indices}} selected_indices = select_subsets(points_iter, percentages, selectors, seed=42) database_indices[fname_iter] = selected_indices + print("DONE SELECTING") # Plot Subsets # ------------ for method, values in selected_indices.items(): + print("PLOTTING.....") + os.makedirs(f"{folder}/plots/{method}", exist_ok=True) for p, indices in values.items(): p = int(p) - fname_plot = f"{folder}/plots/subset_{method}_p{p:02d}_{fname_iter}.png" + fname_plot = ( + f"{folder}/plots/{method}/subset_{method}_p{p:02d}_{fname_iter}.png" + ) + print(f"PLOT {fname_plot}") plot_data_2d( points_plot, values_plot,