#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
data = pd.read_csv('dataset_sdn.csv')
data.head()
data.shape
data.info()
##### Here we see that the label contains binary values: 0 - Benign, 1 - Malicious
data.label.unique()
data.label.value_counts()
label_dict = dict(data.label.value_counts())
sns.countplot(x=data.label)
# Index 0 holds the benign count and index 1 the malicious count,
# so the labels must follow the same order.
labels = ['Benign', 'Malicious']
sizes = [label_dict[0], label_dict[1]]
plt.figure(figsize=(13, 8))
plt.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
plt.legend(labels)
plt.title('The percentage of Benign and Malicious requests in the dataset')
# plt.show()
data.describe()
# Let's look at a visualisation of the null-valued features
figure(figsize=(9, 5), dpi=80)
# Only plot columns that actually contain null values
data[data.columns[data.isna().sum() > 0]].isna().sum().sort_values().plot.bar()
plt.title("Features which have null values")
data.isnull().sum()
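# Since the modelling section below simply drops rows with NaNs, it is worth
# checking how much data that costs. A minimal sketch (what counts as an
# acceptable loss is our own judgment, not part of the original analysis):
rows_before = len(data)
rows_after = len(data.dropna())
print(f"dropna() would remove {rows_before - rows_after} of {rows_before} rows "
      f"({(rows_before - rows_after) / rows_before:.2%})")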
numeric_df = data.select_dtypes(include=['int64', 'float64'])
object_df = data.select_dtypes(include=['object'])
numeric_cols = numeric_df.columns
object_cols = object_df.columns
print('Numeric Columns: ')
print(numeric_cols, '\n')
print('Object Columns: ')
print(object_cols, '\n')
print('Number of Numeric Features: ', len(numeric_cols))
print('Number of Object Features: ', len(object_cols))
# In[14]:
object_df.head()
# In[15]:
#### Let's look at the object columns (Source, Destination, Protocol)
figure(figsize=(12, 7), dpi=80)
src_counts = data.src.value_counts()
plt.barh(src_counts.index.tolist(), src_counts.values, color='lawngreen')
for idx, val in enumerate(src_counts.values):
    plt.text(x=val, y=idx - 0.2, s=str(val), color='r', size=13)
plt.xlabel('Number of Requests')
plt.ylabel('IP address of sender')
plt.title('Number of all requests')
# In[16]:
figure(figsize=(12, 7), dpi=80)
attack_counts = data[data.label == 1].src.value_counts()
plt.barh(attack_counts.index.tolist(), attack_counts.values, color='blue')
for idx, val in enumerate(attack_counts.values):
    plt.text(x=val, y=idx - 0.2, s=str(val), color='r', size=13)
plt.xlabel('Number of Requests')
plt.ylabel('IP address of sender')
plt.title('Number of attack requests')
# In[17]:
figure(figsize=(12, 7), dpi=80)
src_counts = data.src.value_counts()
attack_counts = data[data.label == 1].src.value_counts()
plt.barh(src_counts.index.tolist(), src_counts.values, color='lawngreen')
plt.barh(attack_counts.index.tolist(), attack_counts.values, color='blue')
for idx, val in enumerate(src_counts.values):
    plt.text(x=val, y=idx - 0.2, s=str(val), color='r', size=13)
for idx, val in enumerate(attack_counts.values):
    plt.text(x=val, y=idx - 0.2, s=str(val), color='w', size=13)
plt.xlabel('Number of Requests')
plt.ylabel('IP address of sender')
plt.legend(['All', 'Malicious'])
plt.title('Number of requests from different IP addresses')
# In[18]:
figure(figsize=(10, 6), dpi=80)
proto_counts = data.Protocol.value_counts()
attack_proto_counts = data[data.label == 1].Protocol.value_counts()
plt.bar(proto_counts.index.tolist(), proto_counts.values, color='r')
plt.bar(attack_proto_counts.index.tolist(), attack_proto_counts.values, color='b')
# NOTE: the annotation values below are hard-coded for this particular dataset
plt.text(x=0 - 0.15, y=41321 + 200, s=str(41321), color='black', size=17)
plt.text(x=1 - 0.15, y=33588 + 200, s=str(33588), color='black', size=17)
plt.text(x=2 - 0.15, y=29436 + 200, s=str(29436), color='black', size=17)
plt.text(x=0 - 0.15, y=9419 + 200, s=str(9419), color='w', size=17)
plt.text(x=1 - 0.15, y=17499 + 200, s=str(17499), color='w', size=17)
plt.text(x=2 - 0.15, y=13866 + 200, s=str(13866), color='w', size=17)
plt.xlabel('Protocol')
plt.ylabel('Count')
plt.legend(['All', 'Malicious'])
plt.title('The number of requests from different protocols')
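# The hard-coded annotations above are brittle if the dataset changes. A sketch
# of deriving the same labels dynamically; it redraws the same values, keyed to
# the bar order established by proto_counts (an assumption on our part):
for idx, proto in enumerate(proto_counts.index):
    plt.text(x=idx - 0.15, y=proto_counts[proto] + 200, s=str(proto_counts[proto]), color='black', size=17)
    attack_val = attack_proto_counts.get(proto, 0)
    plt.text(x=idx - 0.15, y=attack_val + 200, s=str(attack_val), color='w', size=17)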
# In[19]:
df = data.copy()
# In[20]:
figure(figsize=(8, 4), dpi=80)
plt.hist(df.dur, bins=20, color='b')
plt.title('Duration')
# plt.show()
# In[21]:
figure(figsize=(8, 4), dpi=80)
plt.hist(df.tx_bytes, bins=20, color='r')
plt.title('TX_BYTES - Transmitted Bytes')
# plt.show()
# In[22]:
figure(figsize=(8, 4), dpi=80)
plt.hist(df.tx_kbps, bins=10, color='g')
plt.title('TX_KBPS - Transmitted Kilobits per Second')
# plt.show()
# In[23]:
plt.hist(df.switch, bins=20, color='r')
plt.title('SWITCH')
plt.xlabel('SWITCH')
# plt.show()
# In[24]:
plt.hist(df[df['label'] == 1].switch, bins=20, color='r')
plt.title('SWITCH (malicious requests only)')
plt.xlabel('SWITCH')
# plt.show()
import joblib
class Model:
    def __init__(self, data, labels):
        self.data = data
        # Standardise the features, then split into train/test sets
        X = preprocessing.StandardScaler().fit(self.data).transform(self.data)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, labels, random_state=42, test_size=0.3)
    def LogisticRegression(self):
        solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
        start_time = time.time()
        results_lr = []
        accuracy_list = []
        for solver in solvers:
            LR = LogisticRegression(C=0.03, solver=solver).fit(self.X_train, self.y_train)
            predicted_lr = LR.predict(self.X_test)
            accuracy_lr = accuracy_score(self.y_test, predicted_lr)
            results_lr.append({'solver': solver, 'accuracy': str(round(accuracy_lr * 100, 2)) + "%",
                               'Coefficients': {'W': LR.coef_, 'b': LR.intercept_}})
            accuracy_list.append(accuracy_lr)
        # Refit with the solver that gave the best accuracy
        solver_name = solvers[accuracy_list.index(max(accuracy_list))]
        LR = LogisticRegression(C=0.03, solver=solver_name).fit(self.X_train, self.y_train)
        predicted_lr = LR.predict(self.X_test)
        accuracy_lr = accuracy_score(self.y_test, predicted_lr)
        print("Accuracy: %.2f%%" % (accuracy_lr * 100.0), '\n')
        print("########################################################################")
        print('Best solver is : ', solver_name)
        print("########################################################################")
        # classification_report expects (y_true, y_pred)
        print(classification_report(self.y_test, predicted_lr), '\n')
        print("########################################################################")
        print("--- %s seconds --- time for LogisticRegression" % (time.time() - start_time))
        # Save the model
        joblib.dump(LR, 'logistic_regression_model.pkl')
    def SupportVectorMachine(self):
        start_time = time.time()
        accuracy_list = []
        result_svm = []
        kernels = ['linear', 'poly', 'rbf', 'sigmoid']
        for kernel in kernels:
            SVM = svm.SVC(kernel=kernel).fit(self.X_train, self.y_train)
            predicted_svm = SVM.predict(self.X_test)
            accuracy_svm = accuracy_score(self.y_test, predicted_svm)
            result_svm.append({"kernel": kernel, "accuracy": f"{round(accuracy_svm * 100, 2)}%"})
            print("Accuracy: %.2f%%" % (accuracy_svm * 100.0))
            print('######################################################################')
            accuracy_list.append(accuracy_svm)
        # Refit with the kernel that gave the best accuracy
        kernel_name = kernels[accuracy_list.index(max(accuracy_list))]
        SVM = svm.SVC(kernel=kernel_name).fit(self.X_train, self.y_train)
        predicted_svm = SVM.predict(self.X_test)
        accuracy_svm = accuracy_score(self.y_test, predicted_svm)
        print(f"Accuracy of SVM model {round(accuracy_svm * 100, 2)}%", '\n')
        print("########################################################################")
        print('Best kernel is : ', kernel_name)
        print("########################################################################")
        print(classification_report(self.y_test, predicted_svm))
        print("########################################################################")
        print("--- %s seconds ---" % (time.time() - start_time))
        # Save the model
        joblib.dump(SVM, 'svm_model.pkl')
    def KNearestNeighbor(self):
        start_time = time.time()
        Ks = 12
        accuracy_knn = np.zeros((Ks - 1))
        std_acc = np.zeros((Ks - 1))
        # Plot accuracy against K to visualise the sensitivity to neighbourhood size
        for n in range(1, Ks):
            neigh = KNeighborsClassifier(n_neighbors=n).fit(self.X_train, self.y_train)
            yhat = neigh.predict(self.X_test)
            accuracy_knn[n - 1] = metrics.accuracy_score(self.y_test, yhat)
            std_acc[n - 1] = np.std(yhat == self.y_test) / np.sqrt(yhat.shape[0])
        plt.figure(figsize=(10, 6))
        plt.plot(range(1, Ks), accuracy_knn, 'g')
        plt.fill_between(range(1, Ks), accuracy_knn - 1 * std_acc, accuracy_knn + 1 * std_acc, alpha=0.10)
        plt.fill_between(range(1, Ks), accuracy_knn - 3 * std_acc, accuracy_knn + 3 * std_acc, alpha=0.10, color="green")
        plt.legend(('Accuracy ', '+/- 1xstd', '+/- 3xstd'))
        plt.ylabel('Accuracy ')
        plt.xlabel('Number of Neighbors (K)')
        plt.tight_layout()
        # plt.show()
        # Grid search over the main KNN hyperparameters
        knnc = KNeighborsClassifier()
        knnc_search = GridSearchCV(knnc, param_grid={'n_neighbors': [3, 5, 10],
                                                     'weights': ['uniform', 'distance'],
                                                     'metric': ['euclidean', 'manhattan']},
                                   n_jobs=-1, cv=3, scoring='accuracy', verbose=2)
        knnc_search.fit(self.X_train, self.y_train)
        n_neighbors = knnc_search.best_params_['n_neighbors']
        weights = knnc_search.best_params_['weights']
        metric = knnc_search.best_params_['metric']
        KNN = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric, weights=weights).fit(self.X_train, self.y_train)
        predicted_knn = KNN.predict(self.X_test)
        accuracy_knn = metrics.accuracy_score(self.y_test, predicted_knn)
        print(f"Accuracy of KNN model {round(accuracy_knn * 100, 2)}%", '\n')
        print("########################################################################")
        print(classification_report(self.y_test, predicted_knn))
        print("########################################################################")
        print("--- %s seconds ---" % (time.time() - start_time))
        # Save the model
        joblib.dump(KNN, 'knn_model.pkl')
    def DecisionTree(self):
        start_time = time.time()
        tree = DecisionTreeClassifier()
        dt_search = GridSearchCV(tree, param_grid={'criterion': ['gini', 'entropy'],
                                                   'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                                                   'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]},
                                 n_jobs=-1, cv=5, scoring='accuracy', verbose=2)
        dt_search.fit(self.X_train, self.y_train)
        criterion = dt_search.best_params_['criterion']
        max_depth = dt_search.best_params_['max_depth']
        max_leaf_nodes = dt_search.best_params_['max_leaf_nodes']
        dtree = DecisionTreeClassifier(criterion=criterion,
                                       max_depth=max_depth,
                                       max_leaf_nodes=max_leaf_nodes).fit(self.X_train, self.y_train)
        predicted_dt = dtree.predict(self.X_test)
        accuracy_dt = metrics.accuracy_score(self.y_test, predicted_dt)
        print(f"criterion: {criterion}, max depth: {max_depth}, max_leaf: {max_leaf_nodes}")
        print(f"The Accuracy is : {round(accuracy_dt * 100, 2)}%")
        print("########################################################################")
        print(classification_report(self.y_test, predicted_dt))
        print("########################################################################")
        print("--- %s seconds ---" % (time.time() - start_time))
        # Save the model
        joblib.dump(dtree, 'decision_tree_model.pkl')
    def RandomForest(self):
        start_time = time.time()
        RF = RandomForestClassifier(criterion='gini',
                                    n_estimators=500,
                                    min_samples_split=10,
                                    max_features='sqrt',
                                    oob_score=True,
                                    random_state=1,
                                    n_jobs=-1).fit(self.X_train, self.y_train)
        predicted_rf = RF.predict(self.X_test)
        rf_accuracy = accuracy_score(self.y_test, predicted_rf)
        print(f"Accuracy of RF is : {round(rf_accuracy * 100, 2)}%", '\n')
        print("########################################################################")
        print(classification_report(self.y_test, predicted_rf))
        print("########################################################################")
        print("--- %s seconds ---" % (time.time() - start_time))
        # Save the model
        joblib.dump(RF, 'random_forest_model.pkl')
"""
Decision Tree works Well
Suppert Vector Machine works well
Logistic Regression works well
KNN works well
Random Forest works well
"""
df = data.copy()
df = df.dropna()
X = df.drop(['dt','src','dst','label'], axis=1)
y = df.label
X = pd.get_dummies(X)
M = Model(X, y)
print(X)
# Logistic Regression(Without FS)
# M.LogisticRegression()
# # Support Vector Machine(Without FS)
# M.SupportVectorMachine()
# # Decision Tree(Without FS)
# M.DecisionTree()
# # Random Forest Classification(Without FS)
# M.RandomForest()
# M.KNearestNeighbor()
df1 = data.copy()
df1 = df1.dropna()
df1.columns
df1.info()
important_features = [
'src',
'pktcount',
'dst',
'byteperflow',
'pktperflow',
'pktrate',
'tot_kbps',
'rx_kbps',
'flows',
'bytecount',
'dt',
'Protocol',
'dur',
'tot_dur'
]
weights = [
17.87,
15.16,
13.64,
12.97,
11.35,
11.35,
9.68,
9.66,
8.95,
4.92,
2.33,
1.31,
1.11,
1.11
]
weighted_features = pd.DataFrame({'features': important_features,
                                  'weights': weights})
weighted_features
# print(weighted_features)
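# A quick bar-chart view of the table above; purely illustrative, using the
# same plotting style as the earlier figures:
figure(figsize=(10, 6), dpi=80)
plt.barh(weighted_features.features, weighted_features.weights, color='teal')
plt.xlabel('Weight')
plt.ylabel('Feature')
plt.title('Importance weights of the selected features')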
X = df1[important_features]
y = df1.label
X = X.drop(['src', 'dst', 'dt'], axis=1)
X.head()
# print(X)
X = pd.get_dummies(X)
abs(X.corr())
fig, ax = plt.subplots(figsize=(10,7))
sns.heatmap(abs(X.corr()), annot=True)
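# The columns dropped just below were chosen by eye from this heatmap. A sketch
# of flagging the same candidates programmatically (the 0.9 threshold is our
# own assumption, not from the original analysis):
corr_matrix = abs(X.corr())
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
high_corr = [col for col in upper.columns if (upper[col] > 0.9).any()]
print('Highly correlated feature candidates:', high_corr)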
# ### There are some duplicated and highly correlated features
X = X.drop(['dur', "pktrate", "pktperflow"], axis=1)
# X.columns
fig, ax = plt.subplots(figsize=(10,7))
sns.heatmap(abs(X.corr()), annot=True)
X = pd.get_dummies(X)
M = Model(X, y)
# print(X)
# ## Logistic Regression(With FS)
# M.LogisticRegression()
# ## Support Vector Machine
# M.SupportVectorMachine()
# M.RandomForest()
# M.DecisionTree()
M.KNearestNeighbor()