# Final_Installer_Merged/ASM_Model_Generator.py
#
# In[2]:
# Import necessary libraries
import warnings
import shutil
import IPython
import os
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm
import pickle
from sklearn.manifold import TSNE
from sklearn import preprocessing
from multiprocessing import Process, Pool
import multiprocessing
import codecs
import random as r
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import re
from nltk.util import ngrams
from sklearn.feature_selection import SelectKBest, chi2, f_regression
import scipy.sparse
import gc
import pickle as pkl
from datetime import datetime as dt
import dask.dataframe as dd
# In[2]:
# Separating byte files and asm files
source = 'train'
destination_1 = 'byteFiles'
destination_2 = 'asmFiles'
# https://stackoverflow.com/a/29651514
def normalize(df):
    """Min-max scale every feature column, leaving 'Id' and 'Class' untouched."""
    result1 = df.copy()
    for feature_name in df.columns:
        if str(feature_name) != 'Id' and str(feature_name) != 'Class':
            max_value = df[feature_name].max()
            min_value = df[feature_name].min()
            value_range = max_value - min_value
            # guard against constant columns, which would otherwise divide by zero
            if value_range != 0:
                result1[feature_name] = (df[feature_name] - min_value) / value_range
            else:
                result1[feature_name] = 0.0
    return result1
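# Illustrative sketch (toy values, not part of the pipeline): min-max scaling
# maps each feature column onto [0, 1] while 'Id' and 'Class' pass through.
_demo = pd.DataFrame({'Id': ['a', 'b', 'c'], 'jmp': [0, 5, 10], 'Class': [1, 2, 3]})
# normalize(_demo)['jmp'] would be 0.0, 0.5, 1.0; 'Id' and 'Class' are unchanged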
def plot_confusion_matrix(test_y, predict_y):
    C = confusion_matrix(test_y, predict_y)
    print("Percentage of misclassified points", (len(test_y) - np.trace(C)) / len(test_y) * 100)
    # C is a 9x9 matrix: cell (i,j) counts points of class i predicted as class j
    A = (((C.T) / (C.sum(axis=1))).T)   # recall matrix: rows sum to 1
    B = (C / C.sum(axis=0))             # precision matrix: columns sum to 1
    labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    cmap = sns.light_palette("green")
    # representing C in heatmap format
    print("-" * 50, "Confusion matrix", "-" * 50)
    plt.figure(figsize=(10, 5))
    sns.heatmap(C, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    # representing B in heatmap format
    print("-" * 50, "Precision matrix", "-" * 50)
    plt.figure(figsize=(10, 5))
    sns.heatmap(B, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    print("Sum of columns in precision matrix", B.sum(axis=0))
    # representing A in heatmap format
    print("-" * 50, "Recall matrix", "-" * 50)
    plt.figure(figsize=(10, 5))
    sns.heatmap(A, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    print("Sum of rows in recall matrix", A.sum(axis=1))
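# Illustrative sketch (toy numbers): how the A and B matrices inside
# plot_confusion_matrix are derived. For C = [[8, 2], [1, 9]], recall row 0 is
# [8/10, 2/10] (divide by the row sum) and precision column 0 is [8/9, 1/9]
# (divide by the column sum).
_C = np.array([[8, 2], [1, 9]])
_recall = (_C.T / _C.sum(axis=1)).T   # rows sum to 1
_precision = _C / _C.sum(axis=0)      # columns sum to 1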
Y = pd.read_csv("trainLabels.csv")
#+++++++++++++++++++++++++++++++++++++++++++++++++++
folder_1 = 'first'
folder_2 = 'second'
folder_3 = 'third'
folder_4 = 'fourth'
folder_5 = 'fifth'
folder_6 = 'output'
for i in [folder_1, folder_2, folder_3, folder_4, folder_5, folder_6]:
    if not os.path.isdir(i):
        os.makedirs(i)
source = 'train/'
files = os.listdir('train')
# deal the files round-robin into five folders so they can be processed in parallel
for i in range(len(files)):
    if i % 5 == 0:
        shutil.copy(source + files[i], 'first')
    elif i % 5 == 1:
        shutil.copy(source + files[i], 'second')
    elif i % 5 == 2:
        shutil.copy(source + files[i], 'third')
    elif i % 5 == 3:
        shutil.copy(source + files[i], 'fourth')
    else:
        shutil.copy(source + files[i], 'fifth')
# In[24]:
# http://flint.cs.yale.edu/cs421/papers/x86-asm/asm.html
# Every worker below appends ids to opcodes.txt; main() truncates it once
# before the workers start so that output from earlier runs is discarded.
def process_folder(folder, outfilename):
    """Extract segment-prefix, opcode, register and keyword counts from every
    .asm file in `folder` and write one comma-separated row per file to
    `outfilename`. (This replaces the five near-identical per-folder copies
    of the same routine; all of them differed only in folder and file name.)"""
    # The prefixes are the asm segment names. Of the ~450 segments present
    # across all asm files, these gave the most useful signal.
    # https://en.wikipedia.org/wiki/Data_segment
    prefixes = ['HEADER:', '.text:', '.Pav:', '.idata:', '.data:', '.bss:', '.rdata:', '.edata:', '.rsrc:', '.tls:', '.reloc:', '.BSS:', '.CODE']
    # Opcodes that gave the best results
    # https://en.wikipedia.org/wiki/X86_instruction_listings
    opcodes = ['jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc', 'dec', 'add', 'imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror', 'rol', 'jnb', 'jz', 'rtn', 'lea', 'movzx']
    # Keywords recommended across several blogs
    keywords = ['.dll', 'std::', ':dword']
    # General-purpose and special registers
    registers = ['edx', 'esi', 'eax', 'ebx', 'ecx', 'edi', 'ebp', 'esp', 'eip']
    file1 = open(outfilename, "w+")
    opcodefile = open("opcodes.txt", "a")
    files = os.listdir(folder)
    for f in files:
        # initialise the count arrays with zeros
        prefixescount = np.zeros(len(prefixes), dtype=int)
        opcodescount = np.zeros(len(opcodes), dtype=int)
        keywordcount = np.zeros(len(keywords), dtype=int)
        registerscount = np.zeros(len(registers), dtype=int)
        features = []
        f2 = f.split('.')[0]
        file1.write(f2 + ",")
        opcodefile.write(f2 + " ")
        # https://docs.python.org/3/library/codecs.html#codecs.ignore_errors
        with codecs.open(os.path.join(folder, f), encoding='cp1252', errors='replace') as fli:
            for lines in fli:
                line = lines.rstrip().split()
                if not line:
                    continue  # skip blank lines (the original crashed on these in four of five copies)
                l = line[0]
                # count the prefixes in each line
                for i in range(len(prefixes)):
                    if prefixes[i] in line[0]:
                        prefixescount[i] += 1
                line = line[1:]
                # count the opcodes in each line
                for i in range(len(opcodes)):
                    if any(opcodes[i] == li for li in line):
                        features.append(opcodes[i])
                        opcodescount[i] += 1
                # count registers, but only inside 'text' and 'CODE' segments
                for i in range(len(registers)):
                    for li in line:
                        if registers[i] in li and ('text' in l or 'CODE' in l):
                            registerscount[i] += 1
                # count keywords in the line
                for i in range(len(keywords)):
                    for li in line:
                        if keywords[i] in li:
                            keywordcount[i] += 1
        # push the counts into the file after the whole file has been read
        for prefix in prefixescount:
            file1.write(str(prefix) + ",")
        for opcode in opcodescount:
            file1.write(str(opcode) + ",")
        for register in registerscount:
            file1.write(str(register) + ",")
        for key in keywordcount:
            file1.write(str(key) + ",")
        file1.write("\n")
    file1.close()
    opcodefile.close()
def main():
    # One worker process per folder; the useful degree of parallelism depends
    # on the number of CPU cores in the system.
    open("opcodes.txt", "w").close()  # truncate any output from a previous run
    jobs = [
        Process(target=process_folder, args=('first', 'asmsmallfile.txt')),
        Process(target=process_folder, args=('second', 'mediumasmfile.txt')),
        Process(target=process_folder, args=('third', 'largeasmfile.txt')),
        Process(target=process_folder, args=('fourth', 'hugeasmfile.txt')),
        Process(target=process_folder, args=('fifth', 'trainasmfile.txt')),
    ]
    for p in jobs:
        p.start()
    # wait for every worker to finish
    for p in jobs:
        p.join()
if __name__ == "__main__":
    main()
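# Alternative sketch (not executed here): the same fan-out can be driven by
# the Pool import above instead of five explicit Process objects.
# with Pool(processes=5) as pool:
#     pool.starmap(process_folder, [('first', 'asmsmallfile.txt'),
#                                   ('second', 'mediumasmfile.txt'),
#                                   ('third', 'largeasmfile.txt'),
#                                   ('fourth', 'hugeasmfile.txt'),
#                                   ('fifth', 'trainasmfile.txt')])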
# Manually assign headers: each row written by process_folder() is
# Id, 13 prefix counts, 26 opcode counts, 9 register counts, 3 keyword counts,
# followed by a trailing comma that pandas parses as one extra empty column.
# (The original header list dropped 'HEADER:' and swapped the register and
# keyword groups, which silently mislabeled every column after 'Id'.)
feature_headers = (
    ['Id']
    + ['HEADER:', '.text:', '.Pav:', '.idata:', '.data:', '.bss:', '.rdata:', '.edata:', '.rsrc:', '.tls:', '.reloc:', '.BSS:', '.CODE']
    + ['jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc', 'dec', 'add', 'imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror', 'rol', 'jnb', 'jz', 'rtn', 'lea', 'movzx']
    + ['edx', 'esi', 'eax', 'ebx', 'ecx', 'edi', 'ebp', 'esp', 'eip']
    + ['.dll', 'std::', ':dword']
    + ['trailing']
)
# File names for merging
output_files = [
    "asmsmallfile.txt", "mediumasmfile.txt", "largeasmfile.txt", "hugeasmfile.txt", "trainasmfile.txt"
]
df_list = []
for file in output_files:
    df = pd.read_csv(file, header=None)  # load each file into a pandas DataFrame
    df_list.append(df)
# Concatenate all DataFrames along axis 0 (rows)
merged_df = pd.concat(df_list, axis=0)
# Assign headers to the merged DataFrame and drop the empty trailing column
merged_df.columns = feature_headers
merged_df = merged_df.drop(columns=['trailing'])
# Save to CSV with headers
merged_df.to_csv("asmoutputfile.csv", index=False, header=True)
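# Sanity check (optional): the header list must line up with the number of
# columns pandas parses from the worker output (52 counts plus the trailing
# empty column), otherwise every feature would be silently mislabeled.
assert len(feature_headers) == pd.read_csv(output_files[0], header=None).shape[1], \
    "feature_headers does not match the columns written by process_folder()"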
#+++++++++++++++++++++++++++++++++++++++++++++++++++
# Verify the output
dfasm = pd.read_csv("asmoutputfile.csv")
# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b> 22. Files sizes of each .asm file as a feature <a id="22"></a> </b></h1>
#
# #### [Back to the top](#0)
#
# In[ ]:
# file sizes of asm files
files = os.listdir('train')
filenames = Y['Id'].tolist()
class_y = Y['Class'].tolist()
class_bytes = []
sizebytes = []
fnames = []
for file in files:
    # read more about os.stat here: https://www.tutorialspoint.com/python/os_stat.htm
    statinfo = os.stat('train/' + file)
    # split the file name at '.' and keep the first part, i.e. the file id
    file = file.split('.')[0]
    if any(file == filename for filename in filenames):
        i = filenames.index(file)
        class_bytes.append(class_y[i])
        # convert bytes to MB
        sizebytes.append(statinfo.st_size / (1024.0 * 1024.0))
        fnames.append(file)
asm_size_byte = pd.DataFrame({'Id': fnames, 'size': sizebytes, 'Class': class_bytes})
result_asm = asm_size_byte.fillna(0)  # replace NaN with 0
# <h4> 4.2.1.2 Distribution of .asm file sizes</h4>
# In[ ]:
# boxplot of asm file sizes
ax = sns.boxplot(x="Class", y="size", data=asm_size_byte)
plt.title("boxplot of .asm file sizes")
plt.show()
# ![Imgur](https://imgur.com/egYeXAJ.png)
# In[ ]:
result_asm = dfasm
result_asm = pd.merge(result_asm, asm_size_byte,on='Id', how='left')
result_asm.head()
# In[ ]:
# normalize each feature column
result_asm = normalize(result_asm)
result_asm.head()
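# Illustrative diagnostic (optional): NaNs can enter result_asm from files
# missing in asm_size_byte (the left merge above) or, without the zero-range
# guard in normalize(), from constant feature columns; the fillna(0) on the
# next line sweeps up whatever remains.
print("rows missing a size after the merge:", result_asm['size'].isnull().sum())
print("constant feature columns:",
      [c for c in result_asm.columns if c not in ('Id', 'Class') and result_asm[c].nunique() == 1])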
result_asm = result_asm.fillna(0) # Replace NaN with 0
# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b> 23. Univariate analysis ONLY on .asm file features <a id="23"></a></b></h1>
#
# #### [Back to the top](#0)
#
# In[ ]:
ax = sns.boxplot(x="Class", y=".text:", data=result_asm)
plt.title("boxplot of .asm text segment")
plt.show()
#
# ![Imgur](https://imgur.com/5jWiNtY.png)
#
# <pre>
# The plot is between the .text segment and the class label
# Classes 1, 2 and 9 can be easily separated
# </pre>
# In[ ]:
ax = sns.boxplot(x="Class", y=".Pav:", data=result_asm)
plt.title("boxplot of .asm pav segment")
plt.show()
# ![Imgur](https://imgur.com/clvpMB9.png)
# In[ ]:
ax = sns.boxplot(x="Class", y=".data:", data=result_asm)
plt.title("boxplot of .asm data segment")
plt.show()
# ![Imgur](https://imgur.com/CqJhugg.png)
#
# <pre>
# The plot is between the data segment and the class label
# Classes 6 and 9 can be easily separated from the given points
# </pre>
# In[ ]:
ax = sns.boxplot(x="Class", y=".bss:", data=result_asm)
plt.title("boxplot of .asm bss segment")
plt.show()
# ![Imgur](https://imgur.com/GKa73JO.png)
#
# <pre>
# Plot between the bss segment and the class label
# Very few files have a bss segment
# </pre>
# In[ ]:
result_asm = result_asm.dropna(subset=['.rdata:']) # Drop rows where '.rdata:' is NaN
ax = sns.boxplot(x="Class", y=".rdata:", data=result_asm)
plt.title("boxplot of .asm rdata segment")
plt.show()
#
# ![Imgur](https://imgur.com/SPZxLJL.png)
#
# <pre>
# Plot between the rdata segment and the class label
# Class 2 can be easily separated: its 75th-percentile files have about 1M rdata lines
# </pre>
# In[ ]:
ax = sns.boxplot(x="Class", y="jmp", data=result_asm)
plt.title("boxplot of .asm jmp opcode")
plt.show()
# ![Imgur](https://imgur.com/0e0ylU2.png)
#
#
# <pre>
# Plot between the jmp opcode and the class label
# For class 1, the 75th-percentile file has a jmp frequency of roughly 2000
# </pre>
# In[ ]:
ax = sns.boxplot(x="Class", y="mov", data=result_asm)
plt.title("boxplot of .asm mov opcode")
plt.show()
#
# ![Imgur](https://imgur.com/Jr5dOJk.png)
#
#
# <pre>
# Plot between the class label and the mov opcode
# For class 1, the 75th-percentile file has a mov frequency of roughly 2000
# </pre>
# In[ ]:
ax = sns.boxplot(x="Class", y="retf", data=result_asm)
plt.title("boxplot of .asm retf opcode")
plt.show()
# ![Imgur](https://imgur.com/VQ25RTI.png)
#
#
# <pre>
# Plot between the class label and the retf opcode
# Class 6 can be easily separated using retf
# The frequency of retf is roughly 250
# </pre>
# In[ ]:
ax = sns.boxplot(x="Class", y="push", data=result_asm)
plt.title("boxplot of .asm push opcode")
plt.show()
#
# ![Imgur](https://imgur.com/FLpSOdK.png)
#
# <pre>
# Plot between the push opcode and the class label
# For class 1, the 75th-percentile file has a push frequency of about 1000
# </pre>
# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b>24. Multivariate Analysis ONLY on .asm file features <a id="24"></a> </b></h1>
#
# #### [Back to the top](#0)
#
# In[ ]:
#multivariate analysis on asm files
#this is with perplexity 50
xtsne=TSNE(perplexity=50)
results=xtsne.fit_transform(result_asm.drop(['Id','Class'], axis=1).fillna(0))
data_y = result_asm['Class']
vis_x = results[:, 0]
vis_y = results[:, 1]
plt.scatter(vis_x, vis_y, c=data_y, cmap=plt.cm.get_cmap("jet", 9))
plt.colorbar(ticks=range(10))
plt.clim(0.5, 9)
plt.show()
# ![Imgur](https://imgur.com/tR4nhGB.png)
# In[ ]:
# Univariate analysis showed that the 'rtn', '.BSS:' and '.CODE' features carry
# negligible information, so here we repeat the multivariate analysis after
# removing them; the plot still looks very messy.
xtsne = TSNE(perplexity=30)
results=xtsne.fit_transform(result_asm.drop(['Id','Class', 'rtn', '.BSS:', '.CODE','size'], axis=1))
vis_x = results[:, 0]
vis_y = results[:, 1]
plt.scatter(vis_x, vis_y, c=data_y, cmap=plt.cm.get_cmap("jet", 9))
plt.colorbar(ticks=range(10))
plt.clim(0.5, 9)
plt.show()
# ![Imgur](https://imgur.com/3Fevxnl.png)
#
# <pre>
# TSNE on the asm data with perplexity 30
# </pre>
# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b>25. Conclusion on EDA ( ONLY on .asm file features) <a id="25"></a> </b></h1>
#
# #### [Back to the top](#0)
#
# <p>
# <ul>
# <li>We have taken only 52 features from the asm files (chosen after reading through many blogs and research papers).</li>
# <li>The univariate analysis was done on only a few important features.</li>
# <li>Take-aways:
# <ul>
# <li>Class 3 can be easily separated because its segment, opcode and keyword frequencies are low.</li>
# <li>Each feature has its own importance in separating the class labels.</li>
# </ul>
# </li>
# </ul>
# </p>
# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b>26. Train and test split ( ONLY on .asm file features ) <a id="26"></a> </b></h1>
#
# #### [Back to the top](#0)
#
# In[ ]:
asm_y = result_asm['Class']
asm_x = result_asm.drop(['Id','Class','.BSS:','rtn','.CODE'], axis=1)
# In[ ]:
class_counts = asm_y.value_counts()
print(class_counts)
X_train_asm, X_test_asm, y_train_asm, y_test_asm = train_test_split(asm_x,asm_y ,stratify=asm_y,test_size=0.20)
X_train_asm, X_cv_asm, y_train_asm, y_cv_asm = train_test_split(X_train_asm, y_train_asm,stratify=y_train_asm,test_size=0.20)
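# Illustrative check (optional): stratify=... keeps the class proportions of
# the train / cv / test splits aligned with the overall label distribution.
print(pd.concat([asm_y.value_counts(normalize=True).rename('all'),
                 y_train_asm.value_counts(normalize=True).rename('train'),
                 y_cv_asm.value_counts(normalize=True).rename('cv'),
                 y_test_asm.value_counts(normalize=True).rename('test')], axis=1))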
# In[ ]:
print(X_cv_asm.isnull().all())
# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b>27. K-Nearest Neigbors ONLY on .asm file features <a id="27"></a> </b></h1>
#
# #### [Back to the top](#0)
#
# In[ ]:
# find more about KNeighborsClassifier() here http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
# -------------------------
# default parameter
# KNeighborsClassifier(n_neighbors=5, weights=uniform, algorithm=auto, leaf_size=30, p=2,
# metric=minkowski, metric_params=None, n_jobs=1, **kwargs)
# methods of
# fit(X, y) : Fit the model using X as training data and y as target values
# predict(X):Predict the class labels for the provided data
# predict_proba(X):Return probability estimates for the test data X.
# find more about CalibratedClassifierCV here at http://scikit-learn.org/stable/modules/generated/sklearn.calibration.CalibratedClassifierCV.html
# ----------------------------
# default paramters
# sklearn.calibration.CalibratedClassifierCV(base_estimator=None, method=sigmoid, cv=3)
#
# some of the methods of CalibratedClassifierCV()
# fit(X, y[, sample_weight]) Fit the calibrated model
# get_params([deep]) Get parameters for this estimator.
# predict(X) Predict the target of new samples.
# predict_proba(X) Posterior probabilities of classification
alpha = [x for x in range(1, 21, 2)]
cv_log_error_array = []
for i in alpha:
    k_cfl = KNeighborsClassifier(n_neighbors=i)
    k_cfl.fit(X_train_asm, y_train_asm)
    sig_clf = CalibratedClassifierCV(k_cfl, method="sigmoid")
    sig_clf.fit(X_train_asm, y_train_asm)
    predict_y = sig_clf.predict_proba(X_cv_asm)
    cv_log_error_array.append(log_loss(y_cv_asm, predict_y, labels=k_cfl.classes_, eps=1e-15))
for i in range(len(cv_log_error_array)):
    print('log_loss for k =', alpha[i], 'is', cv_log_error_array[i])
best_alpha = np.argmin(cv_log_error_array)
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
k_cfl = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
k_cfl.fit(X_train_asm, y_train_asm)
sig_clf = CalibratedClassifierCV(k_cfl, method="sigmoid")
sig_clf.fit(X_train_asm, y_train_asm)
pred_y = sig_clf.predict(X_test_asm)
predict_y = sig_clf.predict_proba(X_train_asm)
print('log loss for train data', log_loss(y_train_asm, predict_y))
predict_y = sig_clf.predict_proba(X_cv_asm)
print('log loss for cv data', log_loss(y_cv_asm, predict_y))
predict_y = sig_clf.predict_proba(X_test_asm)
print('log loss for test data', log_loss(y_test_asm, predict_y))
plot_confusion_matrix(y_test_asm, sig_clf.predict(X_test_asm))
os.makedirs('asm_models', exist_ok=True)  # ensure the model directory exists
with open('asm_models/KNeighborsClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)
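# Illustrative sketch (not part of the original pipeline): the pickled,
# calibrated model can be reloaded later and used for probability predictions;
# X_test_asm stands in for whatever feature frame the caller has prepared.
with open('asm_models/KNeighborsClassifier.pkl', 'rb') as model_file:
    _loaded_knn = pickle.load(model_file)
print(_loaded_knn.predict_proba(X_test_asm)[:3])  # class probabilities for 3 rows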
# ![Imgur](https://imgur.com/xtCOdJi.png)
#
# ![Imgur](https://imgur.com/vTUky0K.png)
#
#
#
# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b>28. Logistic Regression ONLY on .asm file features <a id="28"></a> </b></h1>
#
#
# #### [Back to the top](#0)
#
#
# In[ ]:
# read more about LogisticRegression at http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# (the code below uses LogisticRegression, not SGDClassifier)
# ------------------------------
# default parameters
# LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
# intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs',
# max_iter=100, verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
# some of the methods
# fit(X, y[, sample_weight]) Fit the model according to the given training data.
# predict(X) Predict class labels for samples in X.
alpha = [10 ** x for x in range(-5, 4)]
cv_log_error_array = []
for i in alpha:
    logisticR = LogisticRegression(penalty='l2', C=i, class_weight='balanced')
    logisticR.fit(X_train_asm, y_train_asm)
    sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
    sig_clf.fit(X_train_asm, y_train_asm)
    predict_y = sig_clf.predict_proba(X_cv_asm)
    cv_log_error_array.append(log_loss(y_cv_asm, predict_y, labels=logisticR.classes_, eps=1e-15))
for i in range(len(cv_log_error_array)):
    print('log_loss for c =', alpha[i], 'is', cv_log_error_array[i])
best_alpha = np.argmin(cv_log_error_array)
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
logisticR = LogisticRegression(penalty='l2', C=alpha[best_alpha], class_weight='balanced')
logisticR.fit(X_train_asm, y_train_asm)
sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
sig_clf.fit(X_train_asm, y_train_asm)
predict_y = sig_clf.predict_proba(X_train_asm)
print('log loss for train data', log_loss(y_train_asm, predict_y, labels=logisticR.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(X_cv_asm)
print('log loss for cv data', log_loss(y_cv_asm, predict_y, labels=logisticR.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(X_test_asm)
print('log loss for test data', log_loss(y_test_asm, predict_y, labels=logisticR.classes_, eps=1e-15))
plot_confusion_matrix(y_test_asm, sig_clf.predict(X_test_asm))
with open('asm_models/LogisticRegression.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)
# ![Imgur](https://imgur.com/8uIh7cZ.png)
#
#
# ![Imgur](https://imgur.com/wV4w7Er.png)
#
#
# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b>29. Random Forest Classifier ONLY on .asm file features <a id="29"></a> </b></h1>
#
# #### [Back to the top](#0)
#
# In[ ]:
# --------------------------------
# default parameters
# sklearn.ensemble.RandomForestClassifier(n_estimators=10, criterion=gini, max_depth=None, min_samples_split=2,
# min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=auto, max_leaf_nodes=None, min_impurity_decrease=0.0,
# min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False,
# class_weight=None)
# Some of the methods of RandomForestClassifier()
# fit(X, y[, sample_weight]) Build a forest of trees from the training set (X, y).
# predict(X) Predict class for X.
# predict_proba(X) Return the class probabilities for X.
# some of the attributes of RandomForestClassifier()
# feature_importances_ : array of shape = [n_features]
# The feature importances (the higher, the more important the feature).
alpha = [10, 50, 100, 500, 1000, 2000, 3000]
cv_log_error_array = []
for i in alpha:
    r_cfl = RandomForestClassifier(n_estimators=i, random_state=42, n_jobs=-1)
    r_cfl.fit(X_train_asm, y_train_asm)
    sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
    sig_clf.fit(X_train_asm, y_train_asm)
    predict_y = sig_clf.predict_proba(X_cv_asm)
    cv_log_error_array.append(log_loss(y_cv_asm, predict_y, labels=r_cfl.classes_, eps=1e-15))
for i in range(len(cv_log_error_array)):
    print('log_loss for c =', alpha[i], 'is', cv_log_error_array[i])
best_alpha = np.argmin(cv_log_error_array)
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
r_cfl = RandomForestClassifier(n_estimators=alpha[best_alpha], random_state=42, n_jobs=-1)
r_cfl.fit(X_train_asm, y_train_asm)
sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
sig_clf.fit(X_train_asm, y_train_asm)
predict_y = sig_clf.predict_proba(X_train_asm)
print('log loss for train data', log_loss(y_train_asm, predict_y, labels=sig_clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(X_cv_asm)
print('log loss for cv data', log_loss(y_cv_asm, predict_y, labels=sig_clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(X_test_asm)
print('log loss for test data', log_loss(y_test_asm, predict_y, labels=sig_clf.classes_, eps=1e-15))
plot_confusion_matrix(y_test_asm, sig_clf.predict(X_test_asm))
with open('asm_models/RandomForestClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)
# ![Imgur](https://imgur.com/C431Dn7.png)
#
# ![Imgur](https://imgur.com/RwZwWtJ.png)
#
#
# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b>30. XgBoost Classifier ONLY on .asm file features <a id="30"></a> </b></h1>
#
# #### [Back to the top](#0)
#
# In[ ]:
# Training an XGBoost classifier on our train data
# find more about XGBClassifier here http://xgboost.readthedocs.io/en/latest/python/python_api.html?#xgboost.XGBClassifier
# -------------------------
# default parameters
# class xgboost.XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True,
# objective='binary:logistic', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1,
# max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
# scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, **kwargs)
# some of the methods of XGBClassifier()
# fit(X, y, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None)
# get_params([deep]) Get parameters for this estimator.
# predict(data, output_margin=False, ntree_limit=0) : Predict with data. NOTE: This function is not thread safe.
# get_score(importance_type='weight') -> get the feature importance
alpha = [10, 50, 100, 500, 1000, 2000, 3000]
cv_log_error_array = []
for i in alpha:
    x_cfl = XGBClassifier(n_estimators=i, nthread=-1)
    x_cfl.fit(X_train_asm, y_train_asm)
    sig_clf = CalibratedClassifierCV(x_cfl, method="sigmoid")
    sig_clf.fit(X_train_asm, y_train_asm)
    predict_y = sig_clf.predict_proba(X_cv_asm)
    cv_log_error_array.append(log_loss(y_cv_asm, predict_y, labels=x_cfl.classes_, eps=1e-15))
for i in range(len(cv_log_error_array)):
    print('log_loss for c =', alpha[i], 'is', cv_log_error_array[i])
best_alpha = np.argmin(cv_log_error_array)
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
x_cfl = XGBClassifier(n_estimators=alpha[best_alpha], nthread=-1)
x_cfl.fit(X_train_asm, y_train_asm)
sig_clf = CalibratedClassifierCV(x_cfl, method="sigmoid")
sig_clf.fit(X_train_asm, y_train_asm)
predict_y = sig_clf.predict_proba(X_train_asm)
print('For values of best alpha =', alpha[best_alpha], 'the train log loss is:', log_loss(y_train_asm, predict_y))
predict_y = sig_clf.predict_proba(X_cv_asm)
print('For values of best alpha =', alpha[best_alpha], 'the cross validation log loss is:', log_loss(y_cv_asm, predict_y))
predict_y = sig_clf.predict_proba(X_test_asm)
print('For values of best alpha =', alpha[best_alpha], 'the test log loss is:', log_loss(y_test_asm, predict_y))
plot_confusion_matrix(y_test_asm, sig_clf.predict(X_test_asm))
with open('asm_models/XGBClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)
# ![Imgur](https://imgur.com/JMb1GDQ.png)
#
#
# ![Imgur](https://imgur.com/mp296Le.png)
#
#
# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b>31. Xgboost Classifier with best hyperparameters ( ONLY on .asm file features ) <a id="31"></a></b></h1>
#
# #### [Back to the top](#0)
#
# In[ ]:
x_cfl = XGBClassifier()
prams = {
    'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.15, 0.2],
    'n_estimators': [100, 200, 500, 1000, 2000],
    'max_depth': [3, 5, 10],
    'colsample_bytree': [0.1, 0.3, 0.5, 1],
    'subsample': [0.1, 0.3, 0.5, 1]
}
random_cfl = RandomizedSearchCV(x_cfl, param_distributions=prams, verbose=10, n_jobs=-1)
random_cfl.fit(X_train_asm, y_train_asm)
# In[ ]:
print(random_cfl.best_params_)
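# Illustrative sketch (an alternative to hardcoding): the parameters found by
# the randomized search can be unpacked straight into a fresh classifier
# instead of retyping them in the next cell.
_x_cfl_from_search = XGBClassifier(**random_cfl.best_params_)  # not fitted here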
# In[ ]:
# Train the final XGBClassifier with the best hyperparameters found by the
# randomized search above, then calibrate it with a sigmoid.
x_cfl = XGBClassifier(n_estimators=200, subsample=0.5, learning_rate=0.15, colsample_bytree=0.5, max_depth=3)
x_cfl.fit(X_train_asm, y_train_asm)
c_cfl = CalibratedClassifierCV(x_cfl, method='sigmoid')
c_cfl.fit(X_train_asm, y_train_asm)
predict_y = c_cfl.predict_proba(X_train_asm)
print('train loss', log_loss(y_train_asm, predict_y))
predict_y = c_cfl.predict_proba(X_cv_asm)
print('cv loss', log_loss(y_cv_asm, predict_y))
predict_y = c_cfl.predict_proba(X_test_asm)
print('test loss', log_loss(y_test_asm, predict_y))