# In[2]:


# Import necessary libraries
import warnings
import shutil
import IPython
import os
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm
import pickle
import pickle as pkl
from sklearn.manifold import TSNE
from sklearn import preprocessing
from multiprocessing import Process, Pool
import multiprocessing
import codecs
import random as r
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import re
from nltk.util import ngrams
from sklearn.feature_selection import SelectKBest, chi2, f_regression
import scipy.sparse
import gc
from datetime import datetime as dt
import dask.dataframe as dd

# In[2]:


# Separating byte files and asm files

source = 'train'
destination_1 = 'byteFiles'
destination_2 = 'asmFiles'

# https://stackoverflow.com/a/29651514
def normalize(df):
    # Min-max scale every column except 'Id' and 'Class'.
    # A constant column gives max == min, so the division yields NaN;
    # those NaNs are filled with 0 further below.
    result1 = df.copy()
    for feature_name in df.columns:
        if str(feature_name) != str('Id') and str(feature_name) != str('Class'):
            max_value = df[feature_name].max()
            min_value = df[feature_name].min()
            result1[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result1

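
# A quick sanity check of normalize() on a toy DataFrame (illustrative only;
# the values below are made up): 'Id' and 'Class' pass through untouched
# while every other column is scaled into [0, 1].
_demo = pd.DataFrame({'Id': ['a', 'b', 'c'], 'f1': [10, 20, 30], 'Class': [1, 2, 3]})
print(normalize(_demo))   # f1 becomes 0.0, 0.5, 1.0
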
def plot_confusion_matrix(test_y, predict_y):
    C = confusion_matrix(test_y, predict_y)
    print("Percentage of misclassified points:", (len(test_y) - np.trace(C)) / len(test_y) * 100)
    # C is a 9x9 matrix: cell (i, j) counts points of class i predicted as class j

    A = ((C.T) / (C.sum(axis=1))).T   # recall matrix: each row of C divided by its row sum
    B = C / C.sum(axis=0)             # precision matrix: each column of C divided by its column sum
    labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    cmap = sns.light_palette("green")

    # representing C in heatmap format
    print("-"*50, "Confusion matrix", "-"*50)
    plt.figure(figsize=(10,5))
    sns.heatmap(C, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()

    # representing B in heatmap format
    print("-"*50, "Precision matrix", "-"*50)
    plt.figure(figsize=(10,5))
    sns.heatmap(B, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    print("Sum of columns in precision matrix", B.sum(axis=0))

    # representing A in heatmap format
    print("-"*50, "Recall matrix", "-"*50)
    plt.figure(figsize=(10,5))
    sns.heatmap(A, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    print("Sum of rows in recall matrix", A.sum(axis=1))

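
# A minimal worked example (toy numbers, 2 classes instead of 9) of the
# normalizations above: rows of the recall matrix A sum to 1 and columns of
# the precision matrix B sum to 1.
_C = np.array([[8, 2],
               [1, 9]])
_A = (_C.T / _C.sum(axis=1)).T   # recall:    [[0.8, 0.2], [0.1, 0.9]]
_B = _C / _C.sum(axis=0)         # precision: [[8/9, 2/11], [1/9, 9/11]]
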

Y = pd.read_csv("trainLabels.csv")
#+++++++++++++++++++++++++++++++++++++++++++++++++++
folder_1 = 'first'
folder_2 = 'second'
folder_3 = 'third'
folder_4 = 'fourth'
folder_5 = 'fifth'
folder_6 = 'output'
for i in [folder_1, folder_2, folder_3, folder_4, folder_5, folder_6]:
    if not os.path.isdir(i):
        os.makedirs(i)

source = 'train/'
files = os.listdir('train')

data = range(0, len(files))

# distribute the files round-robin across the five folders
for i in range(0, len(files)):
    if i % 5 == 0:
        shutil.copy(source + files[data[i]], 'first')
    elif i % 5 == 1:
        shutil.copy(source + files[data[i]], 'second')
    elif i % 5 == 2:
        shutil.copy(source + files[data[i]], 'third')
    elif i % 5 == 3:
        shutil.copy(source + files[data[i]], 'fourth')
    elif i % 5 == 4:
        shutil.copy(source + files[data[i]], 'fifth')


# In[24]:


# http://flint.cs.yale.edu/cs421/papers/x86-asm/asm.html

opcodefile = open("opcodes.txt", 'w+')
def firstprocess():
    # The prefixes identify the segments present in the asm files.
    # There are roughly 450 distinct segments across all asm files;
    # these prefixes are the segments that proved most informative.
    # https://en.wikipedia.org/wiki/Data_segment

    prefixes = ['HEADER:','.text:','.Pav:','.idata:','.data:','.bss:','.rdata:','.edata:','.rsrc:','.tls:','.reloc:','.BSS:','.CODE']
    # opcodes that gave the best results
    # https://en.wikipedia.org/wiki/X86_instruction_listings
    opcodes = ['jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc', 'dec', 'add','imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror', 'rol', 'jnb','jz','rtn','lea','movzx']
    # keywords collected from various blogs
    keywords = ['.dll','std::',':dword']
    # general-purpose and special registers
    registers = ['edx','esi','eax','ebx','ecx','edi','ebp','esp','eip']

    file1 = open("asmsmallfile.txt", "w+")
    files = os.listdir('first')
    for f in files:
        # initialize the count arrays with zeros
        prefixescount = np.zeros(len(prefixes), dtype=int)
        opcodescount = np.zeros(len(opcodes), dtype=int)
        keywordcount = np.zeros(len(keywords), dtype=int)
        registerscount = np.zeros(len(registers), dtype=int)
        features = []
        f2 = f.split('.')[0]
        file1.write(f2 + ",")

        opcodefile.write(f2 + " ")
        # https://docs.python.org/3/library/codecs.html#codecs.ignore_errors
        # https://docs.python.org/3/library/codecs.html#codecs.Codec.encode
        with codecs.open('first/' + f, encoding='cp1252', errors='replace') as fli:
            for lines in fli:
                # https://www.tutorialspoint.com/python3/string_rstrip.htm
                line = lines.rstrip().split()
                # skip empty lines to avoid an IndexError below
                if not line:
                    continue
                l = line[0]
                # count the prefixes in each line
                for i in range(len(prefixes)):
                    if prefixes[i] in line[0]:
                        prefixescount[i] += 1
                line = line[1:]
                # count the opcodes in each line
                for i in range(len(opcodes)):
                    if any(opcodes[i] == li for li in line):
                        features.append(opcodes[i])
                        opcodescount[i] += 1
                # count registers in the line
                for i in range(len(registers)):
                    for li in line:
                        # registers are counted only in the 'text' and 'CODE' segments
                        if registers[i] in li and ('text' in l or 'CODE' in l):
                            registerscount[i] += 1
                # count keywords in the line
                for i in range(len(keywords)):
                    for li in line:
                        if keywords[i] in li:
                            keywordcount[i] += 1
        # write the counts to the output file after reading the whole file
        for prefix in prefixescount:
            file1.write(str(prefix) + ",")
        for opcode in opcodescount:
            file1.write(str(opcode) + ",")
        for register in registerscount:
            file1.write(str(register) + ",")
        for key in keywordcount:
            file1.write(str(key) + ",")
        file1.write("\n")
    file1.close()


# same as firstprocess(), for the 'second' folder
def secondprocess():
    prefixes = ['HEADER:','.text:','.Pav:','.idata:','.data:','.bss:','.rdata:','.edata:','.rsrc:','.tls:','.reloc:','.BSS:','.CODE']
    opcodes = ['jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc', 'dec', 'add','imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror', 'rol', 'jnb','jz','rtn','lea','movzx']
    keywords = ['.dll','std::',':dword']
    registers = ['edx','esi','eax','ebx','ecx','edi','ebp','esp','eip']

    file1 = open("mediumasmfile.txt", "w+")
    files = os.listdir('second')
    for f in files:
        prefixescount = np.zeros(len(prefixes), dtype=int)
        opcodescount = np.zeros(len(opcodes), dtype=int)
        keywordcount = np.zeros(len(keywords), dtype=int)
        registerscount = np.zeros(len(registers), dtype=int)
        features = []
        f2 = f.split('.')[0]
        file1.write(f2 + ",")
        opcodefile.write(f2 + " ")
        with codecs.open('second/' + f, encoding='cp1252', errors='replace') as fli:
            for lines in fli:
                line = lines.rstrip().split()
                # skip empty lines to avoid an IndexError (guard added, mirroring firstprocess)
                if not line:
                    continue
                l = line[0]
                for i in range(len(prefixes)):
                    if prefixes[i] in line[0]:
                        prefixescount[i] += 1
                line = line[1:]
                for i in range(len(opcodes)):
                    if any(opcodes[i] == li for li in line):
                        features.append(opcodes[i])
                        opcodescount[i] += 1
                for i in range(len(registers)):
                    for li in line:
                        if registers[i] in li and ('text' in l or 'CODE' in l):
                            registerscount[i] += 1
                for i in range(len(keywords)):
                    for li in line:
                        if keywords[i] in li:
                            keywordcount[i] += 1
        for prefix in prefixescount:
            file1.write(str(prefix) + ",")
        for opcode in opcodescount:
            file1.write(str(opcode) + ",")
        for register in registerscount:
            file1.write(str(register) + ",")
        for key in keywordcount:
            file1.write(str(key) + ",")
        file1.write("\n")
    file1.close()

# same as firstprocess(), for the 'third' folder
def thirdprocess():
    prefixes = ['HEADER:','.text:','.Pav:','.idata:','.data:','.bss:','.rdata:','.edata:','.rsrc:','.tls:','.reloc:','.BSS:','.CODE']
    opcodes = ['jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc', 'dec', 'add','imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror', 'rol', 'jnb','jz','rtn','lea','movzx']
    keywords = ['.dll','std::',':dword']
    registers = ['edx','esi','eax','ebx','ecx','edi','ebp','esp','eip']

    file1 = open("large_asmfile.txt", "w+")
    files = os.listdir('third')
    for f in files:
        prefixescount = np.zeros(len(prefixes), dtype=int)
        opcodescount = np.zeros(len(opcodes), dtype=int)
        keywordcount = np.zeros(len(keywords), dtype=int)
        registerscount = np.zeros(len(registers), dtype=int)
        features = []
        f2 = f.split('.')[0]
        file1.write(f2 + ",")
        opcodefile.write(f2 + " ")
        with codecs.open('third/' + f, encoding='cp1252', errors='replace') as fli:
            for lines in fli:
                line = lines.rstrip().split()
                # guard against empty lines (IndexError: list index out of range)
                if len(line) > 0:
                    l = line[0]
                    for i in range(len(prefixes)):
                        if prefixes[i] in line[0]:
                            prefixescount[i] += 1
                    line = line[1:]
                    for i in range(len(opcodes)):
                        if any(opcodes[i] == li for li in line):
                            features.append(opcodes[i])
                            opcodescount[i] += 1
                    for i in range(len(registers)):
                        for li in line:
                            if registers[i] in li and ('text' in l or 'CODE' in l):
                                registerscount[i] += 1
                    for i in range(len(keywords)):
                        for li in line:
                            if keywords[i] in li:
                                keywordcount[i] += 1
        for prefix in prefixescount:
            file1.write(str(prefix) + ",")
        for opcode in opcodescount:
            file1.write(str(opcode) + ",")
        for register in registerscount:
            file1.write(str(register) + ",")
        for key in keywordcount:
            file1.write(str(key) + ",")
        file1.write("\n")
    file1.close()


# same as firstprocess(), for the 'fourth' folder
def fourthprocess():
    prefixes = ['HEADER:','.text:','.Pav:','.idata:','.data:','.bss:','.rdata:','.edata:','.rsrc:','.tls:','.reloc:','.BSS:','.CODE']
    opcodes = ['jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc', 'dec', 'add','imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror', 'rol', 'jnb','jz','rtn','lea','movzx']
    keywords = ['.dll','std::',':dword']
    registers = ['edx','esi','eax','ebx','ecx','edi','ebp','esp','eip']
    file1 = open("hugeasmfile.txt", "w+")
    files = os.listdir('fourth/')
    for f in files:
        prefixescount = np.zeros(len(prefixes), dtype=int)
        opcodescount = np.zeros(len(opcodes), dtype=int)
        keywordcount = np.zeros(len(keywords), dtype=int)
        registerscount = np.zeros(len(registers), dtype=int)
        features = []
        f2 = f.split('.')[0]
        file1.write(f2 + ",")
        opcodefile.write(f2 + " ")
        with codecs.open('fourth/' + f, encoding='cp1252', errors='replace') as fli:
            for lines in fli:
                line = lines.rstrip().split()
                # skip empty lines to avoid an IndexError (guard added, mirroring firstprocess)
                if not line:
                    continue
                l = line[0]
                for i in range(len(prefixes)):
                    if prefixes[i] in line[0]:
                        prefixescount[i] += 1
                line = line[1:]
                for i in range(len(opcodes)):
                    if any(opcodes[i] == li for li in line):
                        features.append(opcodes[i])
                        opcodescount[i] += 1
                for i in range(len(registers)):
                    for li in line:
                        if registers[i] in li and ('text' in l or 'CODE' in l):
                            registerscount[i] += 1
                for i in range(len(keywords)):
                    for li in line:
                        if keywords[i] in li:
                            keywordcount[i] += 1
        for prefix in prefixescount:
            file1.write(str(prefix) + ",")
        for opcode in opcodescount:
            file1.write(str(opcode) + ",")
        for register in registerscount:
            file1.write(str(register) + ",")
        for key in keywordcount:
            file1.write(str(key) + ",")
        file1.write("\n")
    file1.close()


# same as firstprocess(), for the 'fifth' folder
def fifthprocess():
    prefixes = ['HEADER:','.text:','.Pav:','.idata:','.data:','.bss:','.rdata:','.edata:','.rsrc:','.tls:','.reloc:','.BSS:','.CODE']
    opcodes = ['jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc', 'dec', 'add','imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror', 'rol', 'jnb','jz','rtn','lea','movzx']
    keywords = ['.dll','std::',':dword']
    registers = ['edx','esi','eax','ebx','ecx','edi','ebp','esp','eip']
    file1 = open("trainasmfile.txt", "w+")
    files = os.listdir('fifth/')
    for f in files:
        prefixescount = np.zeros(len(prefixes), dtype=int)
        opcodescount = np.zeros(len(opcodes), dtype=int)
        keywordcount = np.zeros(len(keywords), dtype=int)
        registerscount = np.zeros(len(registers), dtype=int)
        features = []
        f2 = f.split('.')[0]
        file1.write(f2 + ",")
        opcodefile.write(f2 + " ")
        with codecs.open('fifth/' + f, encoding='cp1252', errors='replace') as fli:
            for lines in fli:
                line = lines.rstrip().split()
                # skip empty lines to avoid an IndexError (guard added, mirroring firstprocess)
                if not line:
                    continue
                l = line[0]
                for i in range(len(prefixes)):
                    if prefixes[i] in line[0]:
                        prefixescount[i] += 1
                line = line[1:]
                for i in range(len(opcodes)):
                    if any(opcodes[i] == li for li in line):
                        features.append(opcodes[i])
                        opcodescount[i] += 1
                for i in range(len(registers)):
                    for li in line:
                        if registers[i] in li and ('text' in l or 'CODE' in l):
                            registerscount[i] += 1
                for i in range(len(keywords)):
                    for li in line:
                        if keywords[i] in li:
                            keywordcount[i] += 1
        for prefix in prefixescount:
            file1.write(str(prefix) + ",")
        for opcode in opcodescount:
            file1.write(str(opcode) + ",")
        for register in registerscount:
            file1.write(str(register) + ",")
        for key in keywordcount:
            file1.write(str(key) + ",")
        file1.write("\n")
    file1.close()
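

# The five functions above differ only in their input folder and output file
# name. A minimal consolidated sketch under that assumption; process_folder is
# a hypothetical helper, not part of the original pipeline.
def process_folder(folder, outname):
    """Extract the same prefix/opcode/register/keyword counts for one folder."""
    prefixes = ['HEADER:','.text:','.Pav:','.idata:','.data:','.bss:','.rdata:','.edata:','.rsrc:','.tls:','.reloc:','.BSS:','.CODE']
    opcodes = ['jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc', 'dec', 'add','imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror', 'rol', 'jnb','jz','rtn','lea','movzx']
    keywords = ['.dll','std::',':dword']
    registers = ['edx','esi','eax','ebx','ecx','edi','ebp','esp','eip']
    with open(outname, "w+") as out:
        for f in os.listdir(folder):
            counts = {name: np.zeros(len(group), dtype=int)
                      for name, group in [('prefixes', prefixes), ('opcodes', opcodes),
                                          ('registers', registers), ('keywords', keywords)]}
            out.write(f.split('.')[0] + ",")
            with codecs.open(os.path.join(folder, f), encoding='cp1252', errors='replace') as fli:
                for lines in fli:
                    line = lines.rstrip().split()
                    if not line:
                        continue
                    l = line[0]
                    for i, p in enumerate(prefixes):
                        if p in line[0]:
                            counts['prefixes'][i] += 1
                    line = line[1:]
                    for i, op in enumerate(opcodes):
                        if any(op == li for li in line):
                            counts['opcodes'][i] += 1
                    for i, reg in enumerate(registers):
                        for li in line:
                            if reg in li and ('text' in l or 'CODE' in l):
                                counts['registers'][i] += 1
                    for i, kw in enumerate(keywords):
                        for li in line:
                            if kw in li:
                                counts['keywords'][i] += 1
            # same output layout as the functions above: each count followed by a comma
            for name in ('prefixes', 'opcodes', 'registers', 'keywords'):
                out.write(",".join(map(str, counts[name])) + ",")
            out.write("\n")

# e.g. process_folder('third', 'large_asmfile.txt') would reproduce thirdprocess().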


# def main():
#     # the code below uses multiprocessing;
#     # the number of processes depends on the number of cores in the system
#     manager = multiprocessing.Manager()
#     p1 = Process(target=firstprocess)
#     p2 = Process(target=secondprocess)
#     p3 = Process(target=thirdprocess)
#     p4 = Process(target=fourthprocess)
#     p5 = Process(target=fifthprocess)
#     # p.start() launches each worker process
#     p1.start()
#     p2.start()
#     p3.start()
#     p4.start()
#     p5.start()
#     # wait for all the processes to finish
#     p1.join()
#     p2.join()
#     p3.join()
#     p4.join()
#     p5.join()


if __name__=="__main__":
    thirdprocess()
    # main()
opcodefile.close()  # close the shared opcode log once processing is done
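
# A minimal alternative sketch to the Process-based main() above, using
# multiprocessing.Pool (already imported). run_all is a hypothetical helper;
# like main(), it is defined but not invoked here.
def run_all():
    with Pool(processes=5) as pool:
        for fn in (firstprocess, secondprocess, thirdprocess,
                   fourthprocess, fifthprocess):
            pool.apply_async(fn)   # errors in workers are not re-raised without .get()
        pool.close()
        pool.join()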


# Manually assign headers if not present.
# The column order must match the write order in the process functions:
# Id, 13 prefixes, 26 opcodes, 9 registers, 3 keywords, plus one empty
# trailing column produced by the final comma on every line.
feature_headers = ['Id',
                   'HEADER:', '.text:', '.Pav:', '.idata:', '.data:', '.bss:', '.rdata:', '.edata:', '.rsrc:',
                   '.tls:', '.reloc:', '.BSS:', '.CODE',
                   'jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc', 'dec',
                   'add', 'imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror', 'rol', 'jnb', 'jz', 'rtn', 'lea',
                   'movzx',
                   'edx', 'esi', 'eax', 'ebx', 'ecx', 'edi', 'ebp', 'esp', 'eip',
                   '.dll', 'std::', ':dword',
                   'trailing']


# File names for merging
output_files = [
    # "asmsmallfile.txt", "mediumasmfile.txt",
    # "largeasmfile.txt",
    "large_asmfile.txt",
    # "hugeasmfile.txt", "trainasmfile.txt"
]

df_list = []
for file in output_files:
    df = pd.read_csv(file, header=None)  # Load each file into a pandas DataFrame
    df_list.append(df)

# Concatenate all DataFrames along axis 0 (rows)
merged_df = pd.concat(df_list, axis=0)
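
# A small sanity check (an addition, not in the original pipeline): the parsed
# column count must match the header list before assignment, otherwise the
# header list above is misaligned with the text files on disk.
assert merged_df.shape[1] == len(feature_headers), \
    (merged_df.shape[1], len(feature_headers))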
# Assign headers to the merged DataFrame
merged_df.columns = feature_headers

# Save to CSV with headers
merged_df.to_csv("asmoutputfile.csv", index=False, header=True)
#+++++++++++++++++++++++++++++++++++++++++++++++++++
# Verify the output
dfasm = pd.read_csv("asmoutputfile.csv")


# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b> 22. File sizes of each .asm file as a feature <a id="22"></a> </b></h1>
#
# #### [Back to the top](#0)
#

# In[ ]:


# file sizes of asm files

files = os.listdir('train')
filenames = Y['Id'].tolist()
class_y = Y['Class'].tolist()
class_bytes = []
sizebytes = []
fnames = []
for file in files:
    # print(os.stat('byteFiles/0A32eTdBKayjCWhZqDOQ.txt'))
    # os.stat_result(st_mode=33206, st_ino=1125899906874507, st_dev=3561571700, st_nlink=1, st_uid=0, st_gid=0,
    # st_size=3680109, st_atime=1519638522, st_mtime=1519638522, st_ctime=1519638522)
    # read more about os.stat here: https://www.tutorialspoint.com/python/os_stat.htm
    statinfo = os.stat('train/' + file)
    # split the file name at '.' and take the first part, i.e. the file id
    file = file.split('.')[0]
    if any(file == filename for filename in filenames):
        i = filenames.index(file)
        class_bytes.append(class_y[i])
        # converting bytes into MB
        sizebytes.append(statinfo.st_size / (1024.0 * 1024.0))
        fnames.append(file)
asm_size_byte = pd.DataFrame({'Id': fnames, 'size': sizebytes, 'Class': class_bytes})

result_asm = asm_size_byte.fillna(0)  # Replace NaN with 0


# <h4> 4.2.1.2 Distribution of .asm file sizes</h4>

# In[ ]:


# boxplot of asm file sizes
ax = sns.boxplot(x="Class", y="size", data=asm_size_byte)
plt.title("boxplot of .asm file sizes")
plt.show()


# 

# In[ ]:

result_asm = dfasm

result_asm = pd.merge(result_asm, asm_size_byte, on='Id', how='left')
result_asm.head()


# In[ ]:


# normalize each column of the data
result_asm = normalize(result_asm)
result_asm.head()

result_asm = result_asm.fillna(0)  # Replace NaN with 0

# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b> 23. Univariate analysis ONLY on .asm file features <a id="23"></a></b></h1>
#
# #### [Back to the top](#0)
#

# In[ ]:


ax = sns.boxplot(x="Class", y=".text:", data=result_asm)
plt.title("boxplot of .asm text segment")
plt.show()


#
# 
#
# <pre>
# The plot is between the text segment and the class label.
# Classes 1, 2 and 9 can be easily separated.
# </pre>

# In[ ]:


ax = sns.boxplot(x="Class", y=".Pav:", data=result_asm)
plt.title("boxplot of .asm pav segment")
plt.show()


# 

# In[ ]:


ax = sns.boxplot(x="Class", y=".data:", data=result_asm)
plt.title("boxplot of .asm data segment")
plt.show()


# 
#
# <pre>
# The plot is between the data segment and the class label.
# Classes 6 and 9 can be easily separated from the given points.
# </pre>

# In[ ]:


ax = sns.boxplot(x="Class", y=".bss:", data=result_asm)
plt.title("boxplot of .asm bss segment")
plt.show()


# 
#
# <pre>
# Plot between the bss segment and the class label.
# Very few files have a bss segment.
# </pre>

# In[ ]:

result_asm = result_asm.dropna(subset=['.rdata:'])  # Drop rows where '.rdata:' is NaN

ax = sns.boxplot(x="Class", y=".rdata:", data=result_asm)
plt.title("boxplot of .asm rdata segment")
plt.show()


#
# 
#
# <pre>
# Plot between the rdata segment and the class label.
# Class 2 can be easily separated: the 75th-percentile files have about 1M rdata lines.
# </pre>

# In[ ]:


ax = sns.boxplot(x="Class", y="jmp", data=result_asm)
plt.title("boxplot of .asm jmp opcode")
plt.show()


# 
#
#
# <pre>
# Plot between the jmp opcode and the class label.
# For class 1, the 75th-percentile files have a jmp frequency of roughly 2000.
# </pre>

# In[ ]:


ax = sns.boxplot(x="Class", y="mov", data=result_asm)
plt.title("boxplot of .asm mov opcode")
plt.show()


#
# 
#
#
# <pre>
# Plot between the mov opcode and the class label.
# For class 1, the 75th-percentile files have a mov frequency of roughly 2000.
# </pre>

# In[ ]:


ax = sns.boxplot(x="Class", y="retf", data=result_asm)
plt.title("boxplot of .asm retf opcode")
plt.show()


# 
#
#
# <pre>
# Plot between the retf opcode and the class label.
# Class 6 can be easily separated with the retf opcode;
# its retf frequency is approximately 250.
# </pre>

# In[ ]:


ax = sns.boxplot(x="Class", y="push", data=result_asm)
plt.title("boxplot of .asm push opcode")
plt.show()


#
# 
#
# <pre>
# Plot between the push opcode and the class label.
# For class 1, the 75th-percentile files have a push frequency of about 1000.
# </pre>

# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b>24. Multivariate Analysis ONLY on .asm file features <a id="24"></a> </b></h1>
#
# #### [Back to the top](#0)
#

# In[ ]:


# multivariate analysis on asm files, with perplexity 50
xtsne = TSNE(perplexity=50)
results = xtsne.fit_transform(result_asm.drop(['Id','Class'], axis=1).fillna(0))
data_y = result_asm['Class']
vis_x = results[:, 0]
vis_y = results[:, 1]
plt.scatter(vis_x, vis_y, c=data_y, cmap=plt.cm.get_cmap("jet", 9))
plt.colorbar(ticks=range(10))
plt.clim(0.5, 9)
plt.show()


# 

# In[ ]:


# Univariate analysis showed that the 'rtn', '.BSS:' and '.CODE' features carry
# very little information, so here we retry the multivariate analysis after
# removing those features; the plot still looks very messy.

xtsne = TSNE(perplexity=30)
results = xtsne.fit_transform(result_asm.drop(['Id','Class', 'rtn', '.BSS:', '.CODE','size'], axis=1))
vis_x = results[:, 0]
vis_y = results[:, 1]
plt.scatter(vis_x, vis_y, c=data_y, cmap=plt.cm.get_cmap("jet", 9))
plt.colorbar(ticks=range(10))
plt.clim(0.5, 9)
plt.show()


# 
#
# <pre>
# t-SNE for the asm data with perplexity 30
# </pre>
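
# A small sketch (an addition) for comparing several perplexities side by side;
# t-SNE is stochastic, so each run will differ slightly.
for _p in (30, 50):
    _emb = TSNE(perplexity=_p).fit_transform(
        result_asm.drop(['Id', 'Class'], axis=1).fillna(0))
    plt.scatter(_emb[:, 0], _emb[:, 1], c=data_y, cmap=plt.cm.get_cmap("jet", 9))
    plt.title("t-SNE with perplexity %d" % _p)
    plt.colorbar(ticks=range(10))
    plt.clim(0.5, 9)
    plt.show()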

# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b>25. Conclusion on EDA ( ONLY on .asm file features) <a id="25"></a> </b></h1>
#
# #### [Back to the top](#0)
#

# <p>
# <li>We have taken only 52 features from asm files (chosen after reading through many blogs and research papers).</li>
# <li>The univariate analysis was done only on a few important features.</li>
# <li>Take-aways
# <ul>
# <li>1. Class 3 can be easily separated because its segment, opcode and keyword frequencies are low.</li>
# <li>2. Each feature has its own importance in separating the class labels.</li>
# </ul>
# </li>
# </p>

# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b>26. Train and test split ( ONLY on .asm file features ) <a id="26"></a> </b></h1>
#
# #### [Back to the top](#0)
#

# In[ ]:


asm_y = result_asm['Class']
asm_x = result_asm.drop(['Id','Class','.BSS:','rtn','.CODE'], axis=1)


# In[ ]:

class_counts = asm_y.value_counts()
print(class_counts)
X_train_asm, X_test_asm, y_train_asm, y_test_asm = train_test_split(asm_x, asm_y, stratify=asm_y, test_size=0.20)
X_train_asm, X_cv_asm, y_train_asm, y_cv_asm = train_test_split(X_train_asm, y_train_asm, stratify=y_train_asm, test_size=0.20)


# In[ ]:


print(X_cv_asm.isnull().all())


# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b>27. K-Nearest Neighbors ONLY on .asm file features <a id="27"></a> </b></h1>
#
# #### [Back to the top](#0)
#

# In[ ]:


# find more about KNeighborsClassifier() here http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
# -------------------------
# default parameters
# KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2,
# metric='minkowski', metric_params=None, n_jobs=1, **kwargs)

# some of its methods
# fit(X, y) : Fit the model using X as training data and y as target values
# predict(X): Predict the class labels for the provided data
# predict_proba(X): Return probability estimates for the test data X.

# find more about CalibratedClassifierCV here at http://scikit-learn.org/stable/modules/generated/sklearn.calibration.CalibratedClassifierCV.html
# ----------------------------
# default parameters
# sklearn.calibration.CalibratedClassifierCV(base_estimator=None, method='sigmoid', cv=3)
#
# some of the methods of CalibratedClassifierCV()
# fit(X, y[, sample_weight]) Fit the calibrated model
# get_params([deep]) Get parameters for this estimator.
# predict(X) Predict the target of new samples.
# predict_proba(X) Posterior probabilities of classification

alpha = [x for x in range(1, 21, 2)]
cv_log_error_array = []
for i in alpha:
    k_cfl = KNeighborsClassifier(n_neighbors=i)
    k_cfl.fit(X_train_asm, y_train_asm)
    sig_clf = CalibratedClassifierCV(k_cfl, method="sigmoid")
    sig_clf.fit(X_train_asm, y_train_asm)
    predict_y = sig_clf.predict_proba(X_cv_asm)
    cv_log_error_array.append(log_loss(y_cv_asm, predict_y, labels=k_cfl.classes_, eps=1e-15))

for i in range(len(cv_log_error_array)):
    print('log_loss for k = ', alpha[i], 'is', cv_log_error_array[i])

best_alpha = np.argmin(cv_log_error_array)

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

k_cfl = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
k_cfl.fit(X_train_asm, y_train_asm)
sig_clf = CalibratedClassifierCV(k_cfl, method="sigmoid")
sig_clf.fit(X_train_asm, y_train_asm)
pred_y = sig_clf.predict(X_test_asm)


predict_y = sig_clf.predict_proba(X_train_asm)
print('log loss for train data', log_loss(y_train_asm, predict_y))
predict_y = sig_clf.predict_proba(X_cv_asm)
print('log loss for cv data', log_loss(y_cv_asm, predict_y))
predict_y = sig_clf.predict_proba(X_test_asm)
print('log loss for test data', log_loss(y_test_asm, predict_y))
plot_confusion_matrix(y_test_asm, sig_clf.predict(X_test_asm))
os.makedirs('asm_models', exist_ok=True)  # ensure the model directory exists before pickling
with open('asm_models/KNeighborsClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)

# 
#
# 
#
#
#


# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b>28. Logistic Regression ONLY on .asm file features <a id="28"></a> </b></h1>
#
#
# #### [Back to the top](#0)
#
#

# In[ ]:


# find more about LogisticRegression() at http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# ------------------------------
# some of its methods
# fit(X, y) Fit the model according to the given training data.
# predict(X) Predict class labels for samples in X.
# predict_proba(X) Probability estimates for samples in X.

alpha = [10 ** x for x in range(-5, 4)]
cv_log_error_array = []
for i in alpha:
    logisticR = LogisticRegression(penalty='l2', C=i, class_weight='balanced')
    logisticR.fit(X_train_asm, y_train_asm)
    sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
    sig_clf.fit(X_train_asm, y_train_asm)
    predict_y = sig_clf.predict_proba(X_cv_asm)
    cv_log_error_array.append(log_loss(y_cv_asm, predict_y, labels=logisticR.classes_, eps=1e-15))

for i in range(len(cv_log_error_array)):
    print('log_loss for c = ', alpha[i], 'is', cv_log_error_array[i])

best_alpha = np.argmin(cv_log_error_array)

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

logisticR = LogisticRegression(penalty='l2', C=alpha[best_alpha], class_weight='balanced')
logisticR.fit(X_train_asm, y_train_asm)
sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
sig_clf.fit(X_train_asm, y_train_asm)

predict_y = sig_clf.predict_proba(X_train_asm)
print('log loss for train data', (log_loss(y_train_asm, predict_y, labels=logisticR.classes_, eps=1e-15)))
predict_y = sig_clf.predict_proba(X_cv_asm)
print('log loss for cv data', (log_loss(y_cv_asm, predict_y, labels=logisticR.classes_, eps=1e-15)))
predict_y = sig_clf.predict_proba(X_test_asm)
print('log loss for test data', (log_loss(y_test_asm, predict_y, labels=logisticR.classes_, eps=1e-15)))
plot_confusion_matrix(y_test_asm, sig_clf.predict(X_test_asm))

with open('asm_models/LogisticRegression.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)

# 
#
#
# 
#
#


# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b>29. Random Forest Classifier ONLY on .asm file features <a id="29"></a> </b></h1>
#
# #### [Back to the top](#0)
#

# In[ ]:


# --------------------------------
# default parameters
# sklearn.ensemble.RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2,
# min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
# min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False,
# class_weight=None)

# Some of the methods of RandomForestClassifier()
# fit(X, y, [sample_weight]) Fit the random forest model according to the given training data.
# predict(X) Perform classification on samples in X.
# predict_proba(X) Return class probability estimates for samples in X.

# some of the attributes of RandomForestClassifier()
# feature_importances_ : array of shape = [n_features]
# The feature importances (the higher, the more important the feature).

alpha = [10, 50, 100, 500, 1000, 2000, 3000]
cv_log_error_array = []
for i in alpha:
    r_cfl = RandomForestClassifier(n_estimators=i, random_state=42, n_jobs=-1)
    r_cfl.fit(X_train_asm, y_train_asm)
    sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
    sig_clf.fit(X_train_asm, y_train_asm)
    predict_y = sig_clf.predict_proba(X_cv_asm)
    cv_log_error_array.append(log_loss(y_cv_asm, predict_y, labels=r_cfl.classes_, eps=1e-15))

for i in range(len(cv_log_error_array)):
    print('log_loss for c = ', alpha[i], 'is', cv_log_error_array[i])


best_alpha = np.argmin(cv_log_error_array)

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

r_cfl = RandomForestClassifier(n_estimators=alpha[best_alpha], random_state=42, n_jobs=-1)
r_cfl.fit(X_train_asm, y_train_asm)
sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
sig_clf.fit(X_train_asm, y_train_asm)
predict_y = sig_clf.predict_proba(X_train_asm)
print('log loss for train data', (log_loss(y_train_asm, predict_y, labels=sig_clf.classes_, eps=1e-15)))
predict_y = sig_clf.predict_proba(X_cv_asm)
print('log loss for cv data', (log_loss(y_cv_asm, predict_y, labels=sig_clf.classes_, eps=1e-15)))
predict_y = sig_clf.predict_proba(X_test_asm)
print('log loss for test data', (log_loss(y_test_asm, predict_y, labels=sig_clf.classes_, eps=1e-15)))
plot_confusion_matrix(y_test_asm, sig_clf.predict(X_test_asm))


with open('asm_models/RandomForestClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)

# 
#
# 
#
#


# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b>30. XgBoost Classifier ONLY on .asm file features <a id="30"></a> </b></h1>
#
# #### [Back to the top](#0)
#

# In[ ]:


# Training a hyper-parameter tuned Xg-Boost classifier on our train data

# find more about the XGBClassifier function here http://xgboost.readthedocs.io/en/latest/python/python_api.html?#xgboost.XGBClassifier
# -------------------------
# default parameters
# class xgboost.XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True,
# objective='binary:logistic', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1,
# max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
# scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, **kwargs)

# some of the methods of XGBClassifier()
# fit(X, y, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None)
# get_params([deep]) Get parameters for this estimator.
# predict(data, output_margin=False, ntree_limit=0) : Predict with data. NOTE: This function is not thread safe.
# get_score(importance_type='weight') -> get the feature importance

alpha = [10, 50, 100, 500, 1000, 2000, 3000]
cv_log_error_array = []
for i in alpha:
    x_cfl = XGBClassifier(n_estimators=i, nthread=-1)
    x_cfl.fit(X_train_asm, y_train_asm)
    sig_clf = CalibratedClassifierCV(x_cfl, method="sigmoid")
    sig_clf.fit(X_train_asm, y_train_asm)
    predict_y = sig_clf.predict_proba(X_cv_asm)
    cv_log_error_array.append(log_loss(y_cv_asm, predict_y, labels=x_cfl.classes_, eps=1e-15))

for i in range(len(cv_log_error_array)):
    print('log_loss for c = ', alpha[i], 'is', cv_log_error_array[i])


best_alpha = np.argmin(cv_log_error_array)

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

x_cfl = XGBClassifier(n_estimators=alpha[best_alpha], nthread=-1)
x_cfl.fit(X_train_asm, y_train_asm)
sig_clf = CalibratedClassifierCV(x_cfl, method="sigmoid")
sig_clf.fit(X_train_asm, y_train_asm)

predict_y = sig_clf.predict_proba(X_train_asm)

print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train_asm, predict_y))
predict_y = sig_clf.predict_proba(X_cv_asm)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv_asm, predict_y))
predict_y = sig_clf.predict_proba(X_test_asm)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test_asm, predict_y))
plot_confusion_matrix(y_test_asm, sig_clf.predict(X_test_asm))

with open('asm_models/XGBClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)


# 
#
#
# 
#
#


# <h1 style="font-size:250%; font-family:cursive; color:#ff6666;"><b>31. Xgboost Classifier with best hyperparameters ( ONLY on .asm file features ) <a id="31"></a></b></h1>
#
# #### [Back to the top](#0)
#

# In[ ]:


x_cfl = XGBClassifier()

prams = {
    'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.15, 0.2],
    'n_estimators': [100, 200, 500, 1000, 2000],
    'max_depth': [3, 5, 10],
    'colsample_bytree': [0.1, 0.3, 0.5, 1],
    'subsample': [0.1, 0.3, 0.5, 1]
}
random_cfl = RandomizedSearchCV(x_cfl, param_distributions=prams, verbose=10, n_jobs=-1)
random_cfl.fit(X_train_asm, y_train_asm)


# In[ ]:


print(random_cfl.best_params_)
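
# A minimal sketch (an addition, assuming the randomized search above has
# finished): build the model directly from the tuned parameters instead of
# hardcoding them. The hardcoded values in the next cell override this when
# the notebook is run in order.
x_cfl = XGBClassifier(**random_cfl.best_params_)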


# In[ ]:


# Training a hyper-parameter tuned Xg-Boost classifier on our train data

# find more about the XGBClassifier function here http://xgboost.readthedocs.io/en/latest/python/python_api.html?#xgboost.XGBClassifier
# -------------------------
# default parameters
# class xgboost.XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True,
# objective='binary:logistic', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1,
# max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
# scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, **kwargs)

# some of the methods of XGBClassifier()
# fit(X, y, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None)
# get_params([deep]) Get parameters for this estimator.
# predict(data, output_margin=False, ntree_limit=0) : Predict with data. NOTE: This function is not thread safe.
# get_score(importance_type='weight') -> get the feature importance

x_cfl = XGBClassifier(n_estimators=200, subsample=0.5, learning_rate=0.15, colsample_bytree=0.5, max_depth=3)
x_cfl.fit(X_train_asm, y_train_asm)
c_cfl = CalibratedClassifierCV(x_cfl, method='sigmoid')
c_cfl.fit(X_train_asm, y_train_asm)

predict_y = c_cfl.predict_proba(X_train_asm)
print('train loss', log_loss(y_train_asm, predict_y))
predict_y = c_cfl.predict_proba(X_cv_asm)
print('cv loss', log_loss(y_cv_asm, predict_y))
predict_y = c_cfl.predict_proba(X_test_asm)
print('test loss', log_loss(y_test_asm, predict_y))