# Final_Installer_Merged/ransomware-analysis-model .py
# 2024-10-25 11:19:11 +05:30
#
# 224 lines
# 6.0 KiB
# Python

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For exampl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Directory holding both the column-name list (output.txt) and the CSV dataset.
INPUT_PATH = '/home/webncodes/Downloads/ransomWare/Ransomeware'

# Build the column names from output.txt: each line is split on ';' and the
# second field is used as the name. The file is read inside a `with` block so
# the handle is closed as soon as the names have been collected.
columns = []
with open(INPUT_PATH + '/output.txt', encoding='utf-8') as f1:
    for line in f1:
        fields = line.split(';')
        if len(fields) < 2:
            # Blank or malformed line: there is no second field to use.
            continue
        columns.append(fields[1].replace('\n', ''))

# The CSV is read with header=None, so the names collected above label its
# columns. Earlier runs pointed this at '/RansomwareData.csv' or '/tra.csv'.
data = pd.read_csv(INPUT_PATH + '/combined_log_summary.csv', header=None, names=columns)
# Partition the rows by label value: 1 -> data_ransomware, 0 -> data_goodware.
labels = data['Label (1 Ransomware / 0 Goodware)']
print(labels)
data_ransomware = data[labels == 1]
data_goodware = data[labels == 0]
print(data_ransomware)
print("PK")
print(data_goodware)
# In[20]:
# Within each subset keep only the columns that hold at least one non-zero
# value; a column that is all zeros in a subset is dropped from that subset.
ransomware_nonzero = data_ransomware.ne(0).any(axis=0)
goodware_nonzero = data_goodware.ne(0).any(axis=0)
data_ransomware = data_ransomware.loc[:, ransomware_nonzero]
data_goodware = data_goodware.loc[:, goodware_nonzero]
# In[24]:
# Basic feature engineering to understand the data set: map every remaining
# column of each subset to its total (the column sum).
#   feature name -> total count
dic_ransomware = {name: values.sum() for name, values in data_ransomware.items()}
dic_goodware = {name: values.sum() for name, values in data_goodware.items()}

# The family id and the label are metadata, not features, so they are removed
# from BOTH dictionaries. pop(..., None) is used instead of `del` because a
# metadata column may already be gone: the all-zero filter applied to each
# subset drops it whenever every value in that subset is 0.
for _totals in (dic_ransomware, dic_goodware):
    for _column in ('Ransomware Family', 'Label (1 Ransomware / 0 Goodware)'):
        _totals.pop(_column, None)
# In[25]:
# Rank the features of each subset by total count, highest first. This is
# exploratory analysis only; nothing here feeds the model.
sorted_dic_ransomware = sorted(dic_ransomware.items(), key=lambda item: item[1], reverse=True)
sorted_dic_goodware = sorted(dic_goodware.items(), key=lambda item: item[1], reverse=True)

# In[26]:
# Top 50 ransomware features. The slice stops at 50 (not 51) so both lists
# have the same length and the comparison below is symmetric.
sorted_dic_ransomware_top50 = sorted_dic_ransomware[:50]
for var in sorted_dic_ransomware_top50:
    print(var)

# In[27]:
# Top 50 goodware features.
sorted_dic_goodware_top50 = sorted_dic_goodware[:50]
for var in sorted_dic_goodware_top50:
    print(var)

# In[28]:
# Features ranked in the ransomware top 50 but absent from the goodware top 50.
set_diff = dict(sorted_dic_ransomware_top50).keys() - dict(sorted_dic_goodware_top50).keys()
print('in ransomware_top50 but not goodmware_top50: \n')
for var in set_diff:
    print(var)
# In[29]:
# Compare the two subsets as a share of samples rather than as raw totals.
COUNT_GOODWARE = len(data_goodware)
COUNT_RANSOMWARE = len(data_ransomware)
print(dic_goodware)
for var in set_diff:
    ransomware_total = dic_ransomware[var]
    # A feature from the ransomware top 50 may be missing from dic_goodware
    # altogether (its column was all zeros for goodware and was dropped), so
    # fall back to a total of 0 instead of raising KeyError.
    goodware_total = dic_goodware.get(var, 0)
    # Guard the divisions so an empty subset reports 0.0 instead of failing.
    ransomware_share = ransomware_total / COUNT_RANSOMWARE if COUNT_RANSOMWARE else 0.0
    goodware_share = goodware_total / COUNT_GOODWARE if COUNT_GOODWARE else 0.0
    print(f'feature {var}, ransomware count is {ransomware_total}, percentage is {ransomware_share}; goodware count is {goodware_total}, percentage is { goodware_share}')
# ### ransomware do more than goodware
# Values noted from an earlier run (ransomware share -> goodware share):
# API:NtTerminateProcess 0.5120274914089347 -> 0.12845010615711253
# STR:15066 0.7663230240549829 -> 0.43842887473460723
# API:SetUnhandledExceptionFilter 0.6323024054982818 -> 0.321656050955414
# Features that appear in only one of the two count dictionaries.
# The sizes are printed explicitly: a bare `len(...)` expression only shows a
# value inside a notebook cell and does nothing when run as a script.
set_diff_ransomware_only = dic_ransomware.keys() - dic_goodware.keys()
print('ransomware-only feature count:', len(set_diff_ransomware_only))
set_diff_goodware_only = dic_goodware.keys() - dic_ransomware.keys()
print('goodware-only feature count:', len(set_diff_goodware_only))


def _print_top_exclusive(ranked, exclusive, limit=50):
    """Print up to *limit* entries of *ranked* whose name is in *exclusive*.

    Args:
        ranked: iterable of ``(feature_name, total)`` pairs, already ordered
            the way they should be listed.
        exclusive: container of feature names that are allowed to be printed.
        limit: maximum number of entries to print.

    Each printed line is ``<index> :  <feature_name> <total>``, with the index
    counting printed entries from 0.
    """
    shown = 0
    for name, total in ranked:
        if shown == limit:
            break
        if name in exclusive:
            print(shown, ": ", name, total)
            shown += 1


# Only ransomware does these, top 50 by total count.
_print_top_exclusive(sorted_dic_ransomware, set_diff_ransomware_only)
# Only goodware does these, top 50 by total count.
_print_top_exclusive(sorted_dic_goodware, set_diff_goodware_only)
# In[9]:
# Assemble the training inputs from the full (unfiltered) data set: the label
# column becomes the target, and both the label and the family columns are
# removed from the feature matrix.
y = data['Label (1 Ransomware / 0 Goodware)']  # Labels
X_data = data.drop(columns='Label (1 Ransomware / 0 Goodware)')  # Features
X = X_data.drop(columns='Ransomware Family')
print(X.head())
print(y.head())

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted
# scaling to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Build the model: two ReLU hidden layers (64 and 32 units) followed by a
# single sigmoid unit for the binary ransomware/goodware decision.
n_features = X_train.shape[1]
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(n_features,)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

# Compile with Adam at an explicit learning rate of 0.0001, binary
# cross-entropy loss, and accuracy as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 50 epochs in batches of 32, holding back 10% of the training rows
# for validation.
model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
)
# Evaluate the trained model on the held-out test rows.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

# Save the model, then load it back from disk so the predictions below come
# from the saved file rather than from the in-memory model.
model.save('updated_ransomware_classifier.h5')
print("trainign complete")
loaded_model = tf.keras.models.load_model('updated_ransomware_classifier.h5')

print(X_test)
predictions = loaded_model.predict(X_test)
# Threshold the sigmoid outputs at 0.5 to obtain 0/1 class labels.
predicted_labels = (predictions > 0.5).astype(int)
true_labels = y_test.values

# Print the first few predictions next to the true labels. Slicing caps the
# preview at 10 rows and also copes with a test split smaller than that,
# where indexing a fixed range(10) would raise IndexError.
for i, (predicted, true) in enumerate(zip(predicted_labels[:10], true_labels[:10])):
    print(f"Sample {i}: Predicted = {predicted[0]}, True = {true}")
# In[ ]: