224 lines
6.0 KiB
Python
224 lines
6.0 KiB
Python
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.preprocessing import StandardScaler
|
|
import tensorflow as tf
|
|
from tensorflow.keras.models import Sequential
|
|
from tensorflow.keras.layers import Dense
|
|
|
|
from sklearn.metrics import confusion_matrix, classification_report
|
|
|
|
import numpy as np # linear algebra
|
|
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
|
|
|
|
# Input data files are available in the read-only "../input/" directory.
# Note: this script does not read from there; it loads its files from INPUT_PATH (defined below).
|
|
import pandas as pd
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
# Directory that holds the column-name list (output.txt) and the CSV dataset.
INPUT_PATH = '/home/webncodes/Downloads/ransomWare/Ransomeware'

# Build the list of CSV column names from output.txt. Every non-blank line is
# split on ';' and its second field is used as a column name.
# The file is opened in a `with` block so the handle is always closed, and
# blank lines (for example a trailing empty line) are skipped instead of
# crashing with IndexError on the missing second field.
columns = []
with open(INPUT_PATH + '/output.txt', encoding='utf-8') as f1:
    for line in f1:
        if not line.strip():
            continue
        fields = line.split(';')
        columns.append(fields[1].replace('\n', ''))
|
|
|
|
# print(columns[0:10])
|
|
# exit(1)
|
|
# print("columns")
|
|
# # print(columns)
|
|
# print("Reading")
|
|
|
|
|
|
# Name of the label column: rows with 1 go to the ransomware frame and rows
# with 0 go to the goodware frame.
LABEL_COLUMN = 'Label (1 Ransomware / 0 Goodware)'

# Load the dataset. The CSV is read without a header row; the column names
# come from `columns`. (RansomwareData.csv and tra.csv were alternative
# inputs previously tried with the same call.)
data = pd.read_csv(
    INPUT_PATH + '/combined_log_summary.csv',
    header=None,
    names=columns,
)

# Separate the rows into data_ransomware and data_goodware by label value.
labels = data[LABEL_COLUMN]
print(labels)

data_ransomware = data.loc[labels == 1]
data_goodware = data.loc[labels == 0]

print(data_ransomware)
print("PK")
print(data_goodware)
|
|
# exit(1)
|
|
# In[20]:
|
|
|
|
|
|
# Drop the features (columns) that are 0 for every row of a class.
# A column is kept when at least one of its values differs from 0.
ransomware_has_nonzero = data_ransomware.ne(0).any(axis=0)
goodware_has_nonzero = data_goodware.ne(0).any(axis=0)

data_ransomware = data_ransomware.loc[:, ransomware_has_nonzero]
data_goodware = data_goodware.loc[:, goodware_has_nonzero]
|
|
|
|
|
|
|
|
# In[24]:
|
|
|
|
|
|
# Basic feature engineering to understand the data: for each class, build a
# dictionary mapping feature name -> sum of that column over the class rows.
#
# The family and label columns are not features, so they are removed from
# BOTH dictionaries. `pop(..., None)` is used instead of `del` because either
# column may already be absent: the all-zero filter applied to the per-class
# frames drops columns (e.g. the label column of the goodware frame, which is
# 0 on every row), and `del` would raise KeyError in that case.
NON_FEATURE_COLUMNS = ('Ransomware Family', 'Label (1 Ransomware / 0 Goodware)')

dic_ransomware = {
    columnName: columnData.sum()
    for columnName, columnData in data_ransomware.items()
}
dic_goodware = {
    columnName: columnData.sum()
    for columnName, columnData in data_goodware.items()
}

for columnName in NON_FEATURE_COLUMNS:
    dic_ransomware.pop(columnName, None)
    dic_goodware.pop(columnName, None)
|
|
|
|
|
|
# In[25]:
|
|
|
|
|
|
# Rank the features of each class by their total count, largest first.
# This is exploratory analysis to better understand the data set.
def _count_of(item):
    """Return the count part of a (feature, count) pair."""
    return item[1]


sorted_dic_ransomware = list(dic_ransomware.items())
sorted_dic_ransomware.sort(key=_count_of, reverse=True)

sorted_dic_goodware = list(dic_goodware.items())
sorted_dic_goodware.sort(key=_count_of, reverse=True)
|
|
|
|
|
|
# In[26]:
|
|
|
|
|
|
# Top 50 features by total count among ransomware samples.
# The slice ends at 50 so it yields at most 50 entries, matching the variable
# name and the goodware top-50 slice (a stop index of 51 would return 51).
sorted_dic_ransomware_top50 = sorted_dic_ransomware[:50]
for feature_count in sorted_dic_ransomware_top50:
    print(feature_count)
|
|
|
|
|
|
# In[27]:
|
|
|
|
|
|
# Top 50 features by total count among goodware samples.
sorted_dic_goodware_top50 = sorted_dic_goodware[:50]
for feature_count in sorted_dic_goodware_top50:
    print(feature_count)
|
|
|
|
|
|
# In[28]:
|
|
|
|
|
|
# Features that appear in the ransomware top 50 but not in the goodware top 50.
ransomware_top_counts = dict(sorted_dic_ransomware_top50)
goodware_top_counts = dict(sorted_dic_goodware_top50)
set_diff = ransomware_top_counts.keys() - goodware_top_counts.keys()

print('in ransomware_top50 but not goodmware_top50: \n')
for feature in set_diff:
    print(feature)
|
|
|
|
|
|
# In[29]:
|
|
|
|
|
|
# Compare, as a share of each class's sample count, the features that are in
# the ransomware top 50 but not in the goodware top 50.
COUNT_GOODWARE = len(data_goodware)
COUNT_RANSOMWARE = len(data_ransomware)


def _share(count, total):
    """Return count / total, or 0.0 when the class has no samples."""
    return count / total if total else 0.0


print(dic_goodware)

for var in set_diff:
    ransomware_count = dic_ransomware[var]
    # A feature can be missing from dic_goodware altogether (columns that are
    # all zero for goodware are dropped before counting), so a missing entry
    # is reported as a count of 0 instead of raising KeyError.
    goodware_count = dic_goodware.get(var, 0)
    print(f'feature {var}, ransomware count is {ransomware_count}, percentage is {_share(ransomware_count, COUNT_RANSOMWARE)}; goodware count is {goodware_count}, percentage is { _share(goodware_count, COUNT_GOODWARE)}')
|
|
|
|
|
|
# ### ransomware do more than goodware
|
|
# API:NtTerminateProcess 0.5120274914089347 -> 0.12845010615711253
|
|
# STR:15066 0.7663230240549829 -> 0.43842887473460723
|
|
# API:SetUnhandledExceptionFilter 0.6323024054982818 -> 0.321656050955414
|
|
|
|
|
|
# Features present in only one class's count dictionary.
ransomware_features = dic_ransomware.keys()
goodware_features = dic_goodware.keys()

# Ransomware does it but goodware does not.
set_diff_ransomware_only = ransomware_features - goodware_features
# Goodware does it but ransomware does not.
set_diff_goodware_only = goodware_features - ransomware_features

# The sizes are evaluated but neither stored nor printed.
len(set_diff_ransomware_only)
len(set_diff_goodware_only)
|
|
|
|
# Only ransomware does it: print the first 50 ransomware-only features in
# descending count order, numbered from 0.
# NOTE(review): the flattened source does not show the loop nesting; this
# assumes the counter advanced only when a feature was printed - confirm.
ransomware_only_ranked = [
    (feature, total)
    for feature, total in sorted_dic_ransomware
    if feature in set_diff_ransomware_only
]
for i, (feature, total) in enumerate(ransomware_only_ranked[:50]):
    print(i, ": ", feature, total)
|
|
|
|
# Only goodware does it: print the first 50 goodware-only features in
# descending count order, numbered from 0.
# NOTE(review): the flattened source does not show the loop nesting; this
# assumes the counter advanced only when a feature was printed - confirm.
goodware_only_ranked = [
    (feature, total)
    for feature, total in sorted_dic_goodware
    if feature in set_diff_goodware_only
]
for i, (feature, total) in enumerate(goodware_only_ranked[:50]):
    print(i, ": ", feature, total)
|
|
|
|
|
|
# In[9]:
|
|
|
|
|
|
# Drop the label and family columns to get the features, then prepare the
# inputs for model training.
LABEL_COLUMN = 'Label (1 Ransomware / 0 Goodware)'

X_data = data.drop(columns=LABEL_COLUMN)  # everything except the label
X = X_data.drop(columns='Ransomware Family')  # features
y = data[LABEL_COLUMN]  # labels

print(X.head())
print(y.head())

# Hold out 20% of the rows for testing; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The scaler is fitted on the training split only and then applied to both.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
|
|
|
|
# Build the model: three Dense layers (64 -> 32 -> 1); the last one is a
# single sigmoid unit for binary classification.
layer_stack = [
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
]
model = Sequential(layer_stack)

# Compile the model with an Adam optimizer at learning rate 0.0001.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
|
|
|
|
|
|
# Train the model, holding back 10% of the training data for validation.
model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
)

# Evaluate the model on the test split.
evaluation = model.evaluate(X_test, y_test)
loss, accuracy = evaluation
print(f"Test Accuracy: {accuracy:.2f}")
|
|
|
|
# Save the trained model, load it back from disk and run it on the test split.
MODEL_PATH = 'updated_ransomware_classifier.h5'
model.save(MODEL_PATH)
print("trainign complete")

loaded_model = tf.keras.models.load_model(MODEL_PATH)
print(X_test)
predictions = loaded_model.predict(X_test)
# An output above 0.5 is mapped to label 1, anything else to label 0.
predicted_labels = (predictions > 0.5).astype(int)
true_labels = y_test.values

# Print the first few predictions next to the true labels. The preview is
# capped at the number of test samples so that a test split with fewer than
# 10 rows does not raise IndexError.
for i in range(min(10, len(true_labels))):
    print(f"Sample {i}: Predicted = {predicted_labels[i][0]}, True = {true_labels[i]}")
|
|
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
|
|
|