"""Explore a ransomware/goodware feature table and train a binary classifier.

The script reads column names from ``output.txt`` and the feature table from
``combined_log_summary.csv`` (both under ``INPUT_PATH``), prints per-class
feature totals and the features that separate the two classes, then trains,
saves, reloads and spot-checks a small dense Keras network.
"""
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

INPUT_PATH = '/home/webncodes/Downloads/ransomWare/Ransomeware'
LABEL_COLUMN = 'Label (1 Ransomware / 0 Goodware)'
FAMILY_COLUMN = 'Ransomware Family'
# Number of entries shown in every "top N" listing below.
TOP_N = 50

# Each non-blank line of output.txt is ';'-separated; the second field is the
# column name.  The context manager closes the file once the names are read.
with open(f'{INPUT_PATH}/output.txt', encoding='utf-8') as names_file:
    columns = [
        line.split(';')[1].replace('\n', '')
        for line in names_file
        if line.strip()  # tolerate blank/trailing lines
    ]

data = pd.read_csv(f'{INPUT_PATH}/combined_log_summary.csv',
                   header=None, names=columns)

# Separate data into data_ransomware and data_goodware.
print(data[LABEL_COLUMN])
data_ransomware = data.loc[data[LABEL_COLUMN] == 1]
data_goodware = data.loc[data[LABEL_COLUMN] == 0]
print(data_ransomware)
print("PK")
print(data_goodware)

# Drop features that are all 0 within each class.
data_ransomware = data_ransomware.loc[:, (data_ransomware != 0).any(axis=0)]
data_goodware = data_goodware.loc[:, (data_goodware != 0).any(axis=0)]

# Basic feature analysis to understand the data set: feature -> total count.
dic_ransomware = {name: values.sum() for name, values in data_ransomware.items()}
dic_goodware = {name: values.sum() for name, values in data_goodware.items()}
# The label and family columns are metadata, not features.  Either may already
# have been removed by the all-zero filter above, so pop() instead of del.
for totals in (dic_ransomware, dic_goodware):
    totals.pop(FAMILY_COLUMN, None)
    totals.pop(LABEL_COLUMN, None)

# Sort by count, descending.
sorted_dic_ransomware = sorted(dic_ransomware.items(), key=lambda x: x[1], reverse=True)
sorted_dic_goodware = sorted(dic_goodware.items(), key=lambda x: x[1], reverse=True)

# Top features seen in ransomware samples.
sorted_dic_ransomware_top50 = sorted_dic_ransomware[:TOP_N]
for var in sorted_dic_ransomware_top50:
    print(var)

# Top features seen in goodware samples.
sorted_dic_goodware_top50 = sorted_dic_goodware[:TOP_N]
for var in sorted_dic_goodware_top50:
    print(var)

# Features in the ransomware top list that are absent from the goodware top
# list, kept in ransomware rank order so the output is reproducible.
goodware_top_names = {name for name, _ in sorted_dic_goodware_top50}
set_diff = [name for name, _ in sorted_dic_ransomware_top50
            if name not in goodware_top_names]
print('in ransomware_top50 but not goodmware_top50: \n')
for var in set_diff:
    print(var)

# The same features from a percentage perspective.
COUNT_GOODWARE = len(data_goodware)
COUNT_RANSOMWARE = len(data_ransomware)
print(dic_goodware)
for var in set_diff:
    ransomware_count = dic_ransomware[var]
    # A feature that never occurs in goodware was dropped from dic_goodware.
    goodware_count = dic_goodware.get(var, 0)
    goodware_share = goodware_count / COUNT_GOODWARE if COUNT_GOODWARE else 0.0
    print(f'feature {var}, ransomware count is {ransomware_count}, '
          f'percentage is {ransomware_count / COUNT_RANSOMWARE}; '
          f'goodware count is {goodware_count}, percentage is {goodware_share}')

# ### ransomware do more than goodware
# API:NtTerminateProcess 0.5120274914089347 -> 0.12845010615711253
# STR:15066 0.7663230240549829 -> 0.43842887473460723
# API:SetUnhandledExceptionFilter 0.6323024054982818 -> 0.321656050955414

# Features with a non-zero total in ransomware only.
set_diff_ransomware_only = dic_ransomware.keys() - dic_goodware.keys()
print('features only in ransomware:', len(set_diff_ransomware_only))

# Features with a non-zero total in goodware only.
set_diff_goodware_only = dic_goodware.keys() - dic_ransomware.keys()
print('features only in goodware:', len(set_diff_goodware_only))

# Ransomware-only features, highest totals first.
ransomware_only_ranked = [entry for entry in sorted_dic_ransomware
                          if entry[0] in set_diff_ransomware_only]
for rank, (name, total) in enumerate(ransomware_only_ranked[:TOP_N]):
    print(rank, ": ", name, total)

# Goodware-only features, highest totals first.
goodware_only_ranked = [entry for entry in sorted_dic_goodware
                        if entry[0] in set_diff_goodware_only]
for rank, (name, total) in enumerate(goodware_only_ranked[:TOP_N]):
    print(rank, ": ", name, total)

# Model training: features are every column except the label and the family.
X_data = data.drop(LABEL_COLUMN, axis=1)
X = X_data.drop(FAMILY_COLUMN, axis=1)
y = data[LABEL_COLUMN]  # Labels
print(X.head())
print(y.head())

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then reuse it for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the model.
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),  # Binary classification
])

# Compile the model.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model.
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1)

# Evaluate the model.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

# Save the model, then reload it to check the saved artefact is usable.
model.save('updated_ransomware_classifier.h5')
print("trainign complete")
loaded_model = tf.keras.models.load_model('updated_ransomware_classifier.h5')
print(X_test)

predictions = loaded_model.predict(X_test)
predicted_labels = (predictions > 0.5).astype(int)
true_labels = y_test.values

# Print the first few predictions next to the true labels; the count is
# bounded by the number of test samples so a small split cannot overrun.
for i in range(min(10, len(true_labels))):
    print(f"Sample {i}: Predicted = {predicted_labels[i][0]}, True = {true_labels[i]}")