Overfitting problem in practice
1. Constructing the dataset
We use a dataset whose sample feature vectors have length 2, with labels 0 or 1 representing the two classes. With the make_moons tool provided by the scikit-learn library we can generate a training set with any number of samples.
import matplotlib.pyplot as plt
# Import dataset generation tools
import numpy as np
import seaborn as sns
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, Sequential, regularizers
from mpl_toolkits.mplot3d import Axes3D
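The functions below also reference two module-level constants, OUTPUT_DIR and N_EPOCHS, which are not shown in the listing above. A minimal sketch of plausible definitions follows; the directory name is an assumption, while the epoch count matches the 500 epochs used throughout this article:

import os

# Assumed settings (not part of the original listing)
OUTPUT_DIR = 'output'   # hypothetical directory for the generated figures
N_EPOCHS = 500          # number of training epochs, as stated in the text

# The experiment functions save into subdirectories of OUTPUT_DIR, so create them up front
for sub in ('network_layers', 'dropout', 'regularizers'):
    os.makedirs(os.path.join(OUTPUT_DIR, sub), exist_ok=True)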
In order to demonstrate the overfitting phenomenon, we sample only 1000 data points and add Gaussian noise with a standard deviation of 0.25:
def load_dataset():
    # Number of sampling points
    N_SAMPLES = 1000
    # Test set ratio (None falls back to scikit-learn's default split of 0.25)
    TEST_SIZE = None
    # Randomly sample 1000 points from the moon distribution and split them into training and test sets
    X, y = make_moons(n_samples=N_SAMPLES, noise=0.25, random_state=100)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=42)
    return X, y, X_train, X_test, y_train, y_test
The make_plot function makes it easy to plot the distribution of the data, given the sample coordinates X and the sample labels y:
def make_plot(X, y, plot_name, file_name, XX=None, YY=None, preds=None, dark=False, output_dir=OUTPUT_DIR):
    # Plot the distribution of the dataset; X holds the 2D coordinates, y the label of each point
    if dark:
        plt.style.use('dark_background')
    else:
        sns.set_style("whitegrid")
    axes = plt.gca()
    axes.set_xlim([-2, 3])
    axes.set_ylim([-1.5, 2])
    axes.set(xlabel="$x_1$", ylabel="$x_2$")
    plt.title(plot_name, fontsize=20, fontproperties='SimHei')
    plt.subplots_adjust(left=0.20)
    plt.subplots_adjust(right=0.80)
    if XX is not None and YY is not None and preds is not None:
        # Shade the predicted class regions and draw the decision boundary at 0.5
        plt.contourf(XX, YY, preds.reshape(XX.shape), 25, alpha=0.08, cmap=plt.cm.Spectral)
        plt.contour(XX, YY, preds.reshape(XX.shape), levels=[.5], cmap="Greys", vmin=0, vmax=.6)
    # Scatter plot with markers chosen according to the labels
    markers = ['o' if i == 1 else 's' for i in y.ravel()]
    mscatter(X[:, 0], X[:, 1], c=y.ravel(), s=20, cmap=plt.cm.Spectral, edgecolors='none', m=markers, ax=axes)
    # Save the figure as a vector graphic
    plt.savefig(output_dir + '/' + file_name)
    plt.close()
def mscatter(x, y, ax=None, m=None, **kw):
    # Scatter plot that accepts a per-point list of markers m
    import matplotlib.markers as mmarkers
    if not ax:
        ax = plt.gca()
    sc = ax.scatter(x, y, **kw)
    if (m is not None) and (len(m) == len(x)):
        paths = []
        for marker in m:
            if isinstance(marker, mmarkers.MarkerStyle):
                marker_obj = marker
            else:
                marker_obj = mmarkers.MarkerStyle(marker)
            path = marker_obj.get_path().transformed(marker_obj.get_transform())
            paths.append(path)
        sc.set_paths(paths)
    return sc
X, y, X_train, X_test, y_train, y_test = load_dataset()
make_plot(X, y, "haha", 'Distribution of crescent-shaped binary dataset.svg')
2. Impact of the number of network layers
In order to explore the degree of overfitting at different network depths, we conduct a total of 5 training experiments. For 𝑛 ∈ [0, 4], a fully connected network with n + 2 layers is constructed and trained for 500 epochs with the Adam optimizer:
def network_layers_influence(X_train, y_train):
    # Build 5 networks with different numbers of layers
    for n in range(5):
        # Create the container
        model = Sequential()
        # Create the first layer
        model.add(layers.Dense(8, input_dim=2, activation='relu'))
        # Add n hidden layers, for a total of n + 2 layers
        for _ in range(n):
            model.add(layers.Dense(32, activation='relu'))
        # Create the final layer
        model.add(layers.Dense(1, activation='sigmoid'))
        # Model assembly and training
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.fit(X_train, y_train, epochs=N_EPOCHS, verbose=1)
        # Plot the decision boundary for networks with different numbers of layers
        # Visualize the x-coordinate in the range [-2, 3]
        xx = np.arange(-2, 3, 0.01)
        # Visualize the y-coordinate in the range [-1.5, 2]
        yy = np.arange(-1.5, 2, 0.01)
        # Generate the x-y plane sampling grid for visualization
        XX, YY = np.meshgrid(xx, yy)
        preds = model.predict_classes(np.c_[XX.ravel(), YY.ravel()])
        print(preds)
        title = "network layers: {0}".format(2 + n)
        file = "network_capacity_%i.svg" % (2 + n)
        make_plot(X_train, y_train, title, file, XX, YY, preds, output_dir=OUTPUT_DIR + '/network_layers')
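The listing above only defines the experiment; assuming the training split returned by load_dataset, it can be run with a call such as the following (not shown in the original excerpt). Note that predict_classes is only available in older tf.keras releases; on newer versions it can be replaced with (model.predict(...) > 0.5).astype("int32").

network_layers_influence(X_train, y_train)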
3. Impact of Dropout layers
To explore the effect of Dropout layers on network training, we conduct 5 experiments. Each uses a 7-layer fully connected network, but inserts 0 to 4 Dropout layers between the fully connected layers and trains for 500 epochs with the Adam optimizer:
def dropout_influence(X_train, y_train):
    # Build 5 networks with different numbers of Dropout layers
    for n in range(5):
        # Create the container
        model = Sequential()
        # Create the first layer
        model.add(layers.Dense(8, input_dim=2, activation='relu'))
        counter = 0
        # The number of hidden layers is fixed at 5
        for _ in range(5):
            model.add(layers.Dense(64, activation='relu'))
            # Insert up to n Dropout layers
            if counter < n:
                counter += 1
                model.add(layers.Dropout(rate=0.5))
        # Output layer
        model.add(layers.Dense(1, activation='sigmoid'))
        # Model assembly
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # Training
        model.fit(X_train, y_train, epochs=N_EPOCHS, verbose=1)
        # Plot the decision boundary for different numbers of Dropout layers
        # Visualize the x-coordinate in the range [-2, 3]
        xx = np.arange(-2, 3, 0.01)
        # Visualize the y-coordinate in the range [-1.5, 2]
        yy = np.arange(-1.5, 2, 0.01)
        # Generate the x-y plane sampling grid for visualization
        XX, YY = np.meshgrid(xx, yy)
        preds = model.predict_classes(np.c_[XX.ravel(), YY.ravel()])
        title = "no Dropout layer" if n == 0 else "{0} Dropout layers".format(n)
        file = "Dropout_%i.svg" % n
        make_plot(X_train, y_train, title, file, XX, YY, preds, output_dir=OUTPUT_DIR + '/dropout')
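As with the previous experiment, the function can be invoked on the training split (call not shown in the original excerpt):

dropout_influence(X_train, y_train)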
4. Impact of regularization
In order to explore the effect of the regularization coefficient 𝜆 on network training, we construct a 5-layer neural network with L2 regularization, where the weight tensors W of the 2nd, 3rd and 4th layers carry L2 regularization constraint terms:
def build_model_with_regularization(_lambda):
    # Create a neural network with regularization terms
    model = Sequential()
    model.add(layers.Dense(8, input_dim=2, activation='relu'))  # No regularization term
    # Layers 2-4 all carry L2 regularization terms
    model.add(layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(_lambda)))
    model.add(layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(_lambda)))
    model.add(layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(_lambda)))
    # Output layer
    model.add(layers.Dense(1, activation='sigmoid'))
    # Model assembly
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
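Conceptually, regularizers.l2(_lambda) adds a penalty of 𝜆 times the sum of squared entries of each regularized weight matrix to the training loss, which drives those weights toward smaller values. A minimal NumPy sketch of the penalty term (illustration only, not part of the original code):

def l2_penalty(W, _lambda):
    # L2 penalty contributed by one weight matrix W: lambda * sum of squared weights
    return _lambda * np.sum(np.square(W))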
Below, we first implement a weight visualization function:
def plot_weights_matrix(model, layer_index, plot_name, file_name, output_dir=OUTPUT_DIR):
    # Plot the weight range of a given layer
    # Extract the weight matrix of the given layer
    weights = model.layers[layer_index].get_weights()[0]
    shape = weights.shape
    # Generate grid coordinates equal in size to the weight matrix
    X = np.array(range(shape[1]))
    Y = np.array(range(shape[0]))
    X, Y = np.meshgrid(X, Y)
    # Plot in 3D
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    ax.xaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
    ax.yaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
    ax.zaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
    plt.title(plot_name, fontsize=20, fontproperties='SimHei')
    # Plot the weight matrix as a surface
    ax.plot_surface(X, Y, weights, cmap=plt.get_cmap('rainbow'), linewidth=0)
    # Set the axis names
    ax.set_xlabel('Grid x-coordinate', fontsize=16, rotation=0, fontproperties='SimHei')
    ax.set_ylabel('Grid y-coordinate', fontsize=16, rotation=0, fontproperties='SimHei')
    ax.set_zlabel('Weight', fontsize=16, rotation=90, fontproperties='SimHei')
    # Save the weight range figure
    plt.savefig(output_dir + "/" + file_name + ".svg")
    plt.close(fig)
Keeping the network structure unchanged, we adjust the regularization coefficient 𝜆 = 0.00001, 0.001, 0.1, 0.12, 0.13 to test the training effectiveness of the network, and plot the decision boundary curves of the learned model on the training set:
def regularizers_influence(X_train, y_train):
    # Test different regularization coefficients
    for _lambda in [1e-5, 1e-3, 1e-1, 0.12, 0.13]:
        # Create a model with regularization terms
        model = build_model_with_regularization(_lambda)
        # Model training
        model.fit(X_train, y_train, epochs=N_EPOCHS, verbose=1)
        # Plot the weight range of layer 2
        layer_index = 2
        plot_title = "regularization coefficient: {}".format(_lambda)
        file_name = "regularized_network_weights_" + str(_lambda)
        # Plot the range of the network weights
        plot_weights_matrix(model, layer_index, plot_title, file_name, output_dir=OUTPUT_DIR + '/regularizers')
        # Plot the decision boundary for different regularization coefficients
        # Visualize the x-coordinate in the range [-2, 3]
        xx = np.arange(-2, 3, 0.01)
        # Visualize the y-coordinate in the range [-1.5, 2]
        yy = np.arange(-1.5, 2, 0.01)
        # Generate the x-y plane sampling grid for visualization
        XX, YY = np.meshgrid(xx, yy)
        preds = model.predict_classes(np.c_[XX.ravel(), YY.ravel()])
        title = "regularization coefficient: {}".format(_lambda)
        file = "regularization_%g.svg" % _lambda
        make_plot(X_train, y_train, title, file, XX, YY, preds, output_dir=OUTPUT_DIR + '/regularizers')
regularizers_influence(X_train, y_train)
This concludes this hands-on article on the overfitting problem in TensorFlow. For more on overfitting in TensorFlow, please search my earlier articles or continue to browse the related articles below. I hope you will continue to support me!