#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn import preprocessing
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import time


data = pd.read_csv('dataset_sdn.csv')
data.head()
data.shape
data.info()


##### Here we see that the label contains boolean values: 0 - Benign, 1 - Malicious

data.label.unique()
data.label.value_counts()
label_dict = dict(data.label.value_counts())
sns.countplot(x=data.label)

# Label 0 is benign and label 1 is malicious, so the pie labels must be in that order.
labels = ['Benign', 'Malicious']
sizes = [label_dict[0], label_dict[1]]
plt.figure(figsize=(13, 8))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
plt.legend(labels)
plt.title('The percentage of Benign and Malicious requests in the dataset')
# plt.show()

data.describe()

# Let's look at a visualization of the features that contain null values
figure(figsize=(9, 5), dpi=80)
data[data.columns[data.isna().sum() > 0]].isna().sum().sort_values().plot.bar()
plt.title("Features which have NULL values")
data.isnull().sum()

numeric_df = data.select_dtypes(include=['int64', 'float64'])
object_df = data.select_dtypes(include=['object'])
numeric_cols = numeric_df.columns
object_cols = object_df.columns
print('Numeric Columns: ')
print(numeric_cols, '\n')
print('Object Columns: ')
print(object_cols, '\n')
print('Number of Numeric Features: ', len(numeric_cols))
print('Number of Object Features: ', len(object_cols))


# In[14]:


object_df.head()


# In[15]:


#### Let's look at the object columns (Source, Destination, Protocol)

figure(figsize=(12, 7), dpi=80)
src_counts = dict(data.src.value_counts())
plt.barh(list(src_counts.keys()), src_counts.values(), color='lawngreen')
for idx, val in enumerate(src_counts.values()):
    plt.text(x=val, y=idx - 0.2, s=str(val), color='r', size=13)
plt.xlabel('Number of Requests')
plt.ylabel('IP address of sender')
plt.title('Number of all requests')


# In[16]:


figure(figsize=(12, 7), dpi=80)
attack_src_counts = dict(data[data.label == 1].src.value_counts())
plt.barh(list(attack_src_counts.keys()), attack_src_counts.values(), color='blue')
for idx, val in enumerate(attack_src_counts.values()):
    plt.text(x=val, y=idx - 0.2, s=str(val), color='r', size=13)
plt.xlabel('Number of Requests')
plt.ylabel('IP address of sender')
plt.title('Number of attack requests')


# In[17]:


figure(figsize=(12, 7), dpi=80)
plt.barh(list(src_counts.keys()), src_counts.values(), color='lawngreen')
plt.barh(list(attack_src_counts.keys()), attack_src_counts.values(), color='blue')
for idx, val in enumerate(src_counts.values()):
    plt.text(x=val, y=idx - 0.2, s=str(val), color='r', size=13)
for idx, val in enumerate(attack_src_counts.values()):
    plt.text(x=val, y=idx - 0.2, s=str(val), color='w', size=13)
plt.xlabel('Number of Requests')
plt.ylabel('IP address of sender')
plt.legend(['All', 'Malicious'])
plt.title('Number of requests from different IP addresses')
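# The three bar charts above repeat the same pattern (horizontal bars plus a
# count label next to each bar). A possible refactor, added here as a sketch
# and not part of the original notebook: `plot_src_counts` is a hypothetical
# helper name, and the usage example assumes `data` is already loaded.
def plot_src_counts(counts, bar_color, text_color):
    """Draw one horizontal bar series with the count printed beside each bar."""
    plt.barh(list(counts.keys()), list(counts.values()), color=bar_color)
    for idx, val in enumerate(counts.values()):
        plt.text(x=val, y=idx - 0.2, s=str(val), color=text_color, size=13)

# Example usage, equivalent to the overlaid chart in the cell above:
# figure(figsize=(12, 7), dpi=80)
# plot_src_counts(dict(data.src.value_counts()), 'lawngreen', 'r')
# plot_src_counts(dict(data[data.label == 1].src.value_counts()), 'blue', 'w')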
# In[18]:


figure(figsize=(10, 6), dpi=80)
protocol_counts = dict(data.Protocol.value_counts())
attack_protocol_counts = dict(data[data.label == 1].Protocol.value_counts())
plt.bar(list(protocol_counts.keys()), protocol_counts.values(), color='r')
plt.bar(list(attack_protocol_counts.keys()), attack_protocol_counts.values(), color='b')
# The annotated values below are hard-coded counts for this particular dataset.
plt.text(x=0 - 0.15, y=41321 + 200, s=str(41321), color='black', size=17)
plt.text(x=1 - 0.15, y=33588 + 200, s=str(33588), color='black', size=17)
plt.text(x=2 - 0.15, y=29436 + 200, s=str(29436), color='black', size=17)
plt.text(x=0 - 0.15, y=9419 + 200, s=str(9419), color='w', size=17)
plt.text(x=1 - 0.15, y=17499 + 200, s=str(17499), color='w', size=17)
plt.text(x=2 - 0.15, y=13866 + 200, s=str(13866), color='w', size=17)
plt.xlabel('Protocol')
plt.ylabel('Count')
plt.legend(['All', 'Malicious'])
plt.title('The number of requests from different protocols')


# In[19]:


df = data.copy()


# In[20]:


figure(figsize=(8, 4), dpi=80)
plt.hist(df.dur, bins=20, color='b')
plt.title('Duration')
# plt.show()


# In[21]:


figure(figsize=(8, 4), dpi=80)
plt.hist(df.tx_bytes, bins=20, color='r')
plt.title('TX_BYTES - Transmitted Bytes')
# plt.show()


# In[22]:


figure(figsize=(8, 4), dpi=80)
plt.hist(df.tx_kbps, bins=10, color='g')
plt.title('TX_KBPS')
# plt.show()


# In[23]:


plt.hist(df.switch, bins=20, color='r')
plt.title('SWITCH')
plt.xlabel('SWITCH')
# plt.show()


# In[24]:


plt.hist(df[df['label'] == 1].switch, bins=20, color='r')
plt.title('SWITCH')
plt.xlabel('SWITCH')
# plt.show()
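# The five histogram cells above differ only in the column, bin count, and
# color. A hedged alternative (a sketch, not part of the original notebook):
# loop over the columns in a single cell. Only columns the notebook already
# references are used.
for col, bins, color in [('dur', 20, 'b'), ('tx_bytes', 20, 'r'),
                         ('tx_kbps', 10, 'g'), ('switch', 20, 'r')]:
    figure(figsize=(8, 4), dpi=80)
    plt.hist(df[col], bins=bins, color=color)
    plt.title(col.upper())
    # plt.show()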
import joblib


class Model:
    """Holds a standardized train/test split and one training method per classifier.

    Each method trains a model, prints its evaluation, and saves it with joblib.
    """

    def __init__(self, data, target):
        self.data = data
        # Standardize the features, then split into train (70%) and test (30%) sets.
        X = preprocessing.StandardScaler().fit(self.data).transform(self.data)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, target, random_state=42, test_size=0.3)

    def LogisticRegression(self):
        # Note: the bare LogisticRegression name below refers to the imported
        # sklearn class, not this method.
        solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
        start_time = time.time()
        results_lr = []
        accuracy_list = []
        # Try each solver and record its test accuracy.
        for solver in solvers:
            LR = LogisticRegression(C=0.03, solver=solver).fit(self.X_train, self.y_train)
            predicted_lr = LR.predict(self.X_test)
            accuracy_lr = accuracy_score(self.y_test, predicted_lr)
            results_lr.append({'solver': solver,
                               'accuracy': str(round(accuracy_lr * 100, 2)) + "%",
                               'Coefficients': {'W': LR.coef_, 'b': LR.intercept_}})
            accuracy_list.append(accuracy_lr)
        # Refit with the best-performing solver.
        solver_name = solvers[accuracy_list.index(max(accuracy_list))]
        LR = LogisticRegression(C=0.03, solver=solver_name).fit(self.X_train, self.y_train)
        predicted_lr = LR.predict(self.X_test)
        accuracy_lr = accuracy_score(self.y_test, predicted_lr)
        print("Accuracy: %.2f%%" % (accuracy_lr * 100.0), '\n')
        print("########################################################################")
        print('Best solver is : ', solver_name)
        print("########################################################################")
        print(classification_report(self.y_test, predicted_lr), '\n')
        print("########################################################################")
        print("--- %s seconds --- time for LogisticRegression" % (time.time() - start_time))
        # Save the model
        joblib.dump(LR, 'logistic_regression_model.pkl')

    def SupportVectorMachine(self):
        start_time = time.time()
        accuracy_list = []
        result_svm = []
        kernels = ['linear', 'poly', 'rbf', 'sigmoid']
        # Try each kernel and record its test accuracy.
        for kernel in kernels:
            SVM = svm.SVC(kernel=kernel).fit(self.X_train, self.y_train)
            predicted_svm = SVM.predict(self.X_test)
            accuracy_svm = accuracy_score(self.y_test, predicted_svm)
            result_svm.append({"kernel": kernel, "accuracy": f"{round(accuracy_svm * 100, 2)}%"})
            print("Accuracy: %.2f%%" % round((accuracy_svm * 100.0), 2))
            print('######################################################################')
            accuracy_list.append(accuracy_svm)
        # Refit with the best-performing kernel.
        kernel_name = kernels[accuracy_list.index(max(accuracy_list))]
        SVM = svm.SVC(kernel=kernel_name).fit(self.X_train, self.y_train)
        predicted_svm = SVM.predict(self.X_test)
        accuracy_svm = accuracy_score(self.y_test, predicted_svm)
        print(f"Accuracy of SVM model {round(accuracy_svm * 100, 2)}%", '\n')
        print("########################################################################")
        print('Best kernel is : ', kernel_name)
        print("########################################################################")
        print(classification_report(self.y_test, predicted_svm))
        print("########################################################################")
        print("--- %s seconds ---" % (time.time() - start_time))
        # Save the model
        joblib.dump(SVM, 'svm_model.pkl')

    def KNearestNeighbor(self):
        start_time = time.time()
        Ks = 12
        accuracy_knn = np.zeros((Ks - 1))
        std_acc = np.zeros((Ks - 1))
        # Evaluate K = 1..11 and plot accuracy with +/- 1 and +/- 3 std bands.
        for n in range(1, Ks):
            neigh = KNeighborsClassifier(n_neighbors=n).fit(self.X_train, self.y_train)
            yhat = neigh.predict(self.X_test)
            accuracy_knn[n - 1] = metrics.accuracy_score(self.y_test, yhat)
            std_acc[n - 1] = np.std(yhat == self.y_test) / np.sqrt(yhat.shape[0])
        plt.figure(figsize=(10, 6))
        plt.plot(range(1, Ks), accuracy_knn, 'g')
        plt.fill_between(range(1, Ks), accuracy_knn - 1 * std_acc,
                         accuracy_knn + 1 * std_acc, alpha=0.10)
        plt.fill_between(range(1, Ks), accuracy_knn - 3 * std_acc,
                         accuracy_knn + 3 * std_acc, alpha=0.10, color="green")
        plt.legend(('Accuracy ', '+/- 1xstd', '+/- 3xstd'))
        plt.ylabel('Accuracy ')
        plt.xlabel('Number of Neighbors (K)')
        plt.tight_layout()
        # plt.show()
        # Grid-search the main KNN hyperparameters, then refit with the best ones.
        knnc = KNeighborsClassifier()
        knnc_search = GridSearchCV(knnc,
                                   param_grid={'n_neighbors': [3, 5, 10],
                                               'weights': ['uniform', 'distance'],
                                               'metric': ['euclidean', 'manhattan']},
                                   n_jobs=-1, cv=3, scoring='accuracy', verbose=2)
        knnc_search.fit(self.X_train, self.y_train)
        n_neighbors = knnc_search.best_params_['n_neighbors']
        weights = knnc_search.best_params_['weights']
        metric = knnc_search.best_params_['metric']
        KNN = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric,
                                   weights=weights).fit(self.X_train, self.y_train)
        predicted_knn = KNN.predict(self.X_test)
        accuracy_knn = metrics.accuracy_score(self.y_test, predicted_knn)
        print(f"Accuracy of KNN model {round(accuracy_knn * 100, 2)}%", '\n')
        print("########################################################################")
        print(classification_report(self.y_test, predicted_knn))
        print("########################################################################")
        print("--- %s seconds ---" % (time.time() - start_time))
        # Save the model
        joblib.dump(KNN, 'knn_model.pkl')

    def DecisionTree(self):
        start_time = time.time()
        # Grid-search the tree's split criterion, depth, and leaf count.
        tree = DecisionTreeClassifier()
        dt_search = GridSearchCV(tree,
                                 param_grid={'criterion': ['gini', 'entropy'],
                                             'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                                             'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]},
                                 n_jobs=-1, cv=5, scoring='accuracy', verbose=2)
        dt_search.fit(self.X_train, self.y_train)
        criterion = dt_search.best_params_['criterion']
        max_depth = dt_search.best_params_['max_depth']
        max_leaf_nodes = dt_search.best_params_['max_leaf_nodes']
        dtree = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth,
                                       max_leaf_nodes=max_leaf_nodes).fit(self.X_train,
                                                                          self.y_train)
        predicted_dt = dtree.predict(self.X_test)
        accuracy_dt = metrics.accuracy_score(self.y_test, predicted_dt)
        print(f"criterion: {criterion}, max depth: {max_depth}, max_leaf: {max_leaf_nodes}")
        print(f"The Accuracy is : {round(accuracy_dt * 100, 2)}%")
        print("########################################################################")
        print(classification_report(self.y_test, predicted_dt))
        print("########################################################################")
        print("--- %s seconds ---" % (time.time() - start_time))
        # Save the model
        joblib.dump(dtree, 'decision_tree_model.pkl')

    def RandomForest(self):
        start_time = time.time()
        RF = RandomForestClassifier(criterion='gini', n_estimators=500, min_samples_split=10,
                                    max_features='sqrt', oob_score=True,
                                    random_state=1, n_jobs=-1).fit(self.X_train, self.y_train)
        predicted_rf = RF.predict(self.X_test)
        accuracy_rf = accuracy_score(self.y_test, predicted_rf)
        print(f"Accuracy of RF is : {round(accuracy_rf * 100, 2)}%", '\n')
        print("########################################################################")
        print(classification_report(self.y_test, predicted_rf))
        print("########################################################################")
        print("--- %s seconds ---" % (time.time() - start_time))
        # Save the model
        joblib.dump(RF, 'random_forest_model.pkl')
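# The imports at the top bring in confusion_matrix, but the notebook never
# uses it. Below is a small helper, added here as a sketch (not part of the
# original notebook), that renders a fitted classifier's confusion matrix in
# the same seaborn heatmap style used later for feature correlations.
def plot_confusion(model, X_test, y_test):
    """Plot the confusion matrix of a fitted classifier on the test split."""
    cm = confusion_matrix(y_test, model.predict(X_test))
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', ax=ax,
                xticklabels=['Benign', 'Malicious'],
                yticklabels=['Benign', 'Malicious'])
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    ax.set_title('Confusion matrix')

# Example usage, after a model has been trained and saved, e.g.:
# plot_confusion(joblib.load('random_forest_model.pkl'), M.X_test, M.y_test)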
"""
Decision Tree works well
Support Vector Machine works well
Logistic Regression works well
KNN works well
Random Forest works well
"""

df = data.copy()
df = df.dropna()
X = df.drop(['dt', 'src', 'dst', 'label'], axis=1)
y = df.label
X = pd.get_dummies(X)
M = Model(X, y)
print(X)

# Logistic Regression (without FS)
# M.LogisticRegression()

# Support Vector Machine (without FS)
# M.SupportVectorMachine()

# Decision Tree (without FS)
# M.DecisionTree()

# Random Forest classification (without FS)
# M.RandomForest()

# K-Nearest Neighbors (without FS)
# M.KNearestNeighbor()

df1 = data.copy()
df1 = df1.dropna()
df1.columns
df1.info()

# Feature-importance weights; these values are precomputed (their derivation
# is not shown in this notebook).
important_features = ['src', 'pktcount', 'dst', 'byteperflow', 'pktperflow',
                      'pktrate', 'tot_kbps', 'rx_kbps', 'flows', 'bytecount',
                      'dt', 'Protocol', 'dur', 'tot_dur']
weights = [17.87, 15.16, 13.64, 12.97, 11.35, 11.35, 9.68,
           9.66, 8.95, 4.92, 2.33, 1.31, 1.11, 1.11]
weighted_features = pd.DataFrame({'features': important_features, 'weights': weights})
weighted_features
# print(weighted_features)

X = df1[important_features]
y = df1.label
X = X.drop(['src', 'dst', 'dt'], axis=1)
X.head()
# print(X)
X = pd.get_dummies(X)
abs(X.corr())
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(abs(X.corr()), annot=True)

# ### There are some duplicated and highly correlated features, so drop them

X = X.drop(['dur', 'pktrate', 'pktperflow'], axis=1)
# X.columns
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(abs(X.corr()), annot=True)
X = pd.get_dummies(X)
M = Model(X, y)
# print(X)

# ## Logistic Regression (with FS)
# M.LogisticRegression()

# ## Support Vector Machine (with FS)
# M.SupportVectorMachine()

# M.RandomForest()
# M.DecisionTree()

M.KNearestNeighbor()
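# Reusing a saved model: a minimal sketch, not part of the original notebook.
# The Model class saves each classifier with joblib but not the StandardScaler
# fitted in __init__, so scoring brand-new data would also require persisting
# that scaler; here we sidestep the issue by reusing the already-scaled test
# split held by `M`. 'knn_model.pkl' exists because M.KNearestNeighbor() was
# just run above.
loaded_knn = joblib.load('knn_model.pkl')
reloaded_preds = loaded_knn.predict(M.X_test)
print("Reloaded KNN accuracy: %.2f%%" % (accuracy_score(M.y_test, reloaded_preds) * 100))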