# # In[2]:

# Import necessary libraries
import warnings
import shutil
import IPython
import os
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm
import pickle
from sklearn.manifold import TSNE
from sklearn import preprocessing
from multiprocessing import Process, Pool
import multiprocessing
import codecs
import random as r
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import re
from nltk.util import ngrams
from sklearn.feature_selection import SelectKBest, chi2, f_regression
import scipy.sparse
import gc
from datetime import datetime as dt
import dask.dataframe as dd


# In[2]:

# separating byte files and asm files
source = 'train'
destination_1 = 'byteFiles'
destination_2 = 'asmFiles'


# https://stackoverflow.com/a/29651514
def normalize(df):
    """Column-wise min-max scaling of every feature except 'Id' and 'Class'."""
    result1 = df.copy()
    for feature_name in df.columns:
        if str(feature_name) != 'Id' and str(feature_name) != 'Class':
            max_value = df[feature_name].max()
            min_value = df[feature_name].min()
            result1[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result1


def plot_confusion_matrix(test_y, predict_y):
    # C is a 9x9 matrix; cell (i, j) holds the number of points of class i predicted as class j
    C = confusion_matrix(test_y, predict_y)
    print("Percentage of misclassified points:", (len(test_y) - np.trace(C)) / len(test_y) * 100)
    A = (((C.T) / (C.sum(axis=1))).T)  # recall matrix: each row normalized
    B = (C / C.sum(axis=0))            # precision matrix: each column normalized
    labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    cmap = sns.light_palette("green")

    # representing C in heatmap format
    print("-" * 50, "Confusion matrix", "-" * 50)
    plt.figure(figsize=(10, 5))
    sns.heatmap(C, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()

    # representing B in heatmap format
    print("-" * 50, "Precision matrix", "-" * 50)
    plt.figure(figsize=(10, 5))
    sns.heatmap(B, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    print("Sum of columns in precision matrix", B.sum(axis=0))

    # representing A in heatmap format
    print("-" * 50, "Recall matrix", "-" * 50)
    plt.figure(figsize=(10, 5))
    sns.heatmap(A, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    print("Sum of rows in recall matrix", A.sum(axis=1))


Y = pd.read_csv("trainLabels.csv")

# +++++++++++++++++++++++++++++++++++++++++++++++++++
# split the train/ directory into five folders so that one process
# can work on each chunk in parallel (see main() below)
folder_1 = 'first'
folder_2 = 'second'
folder_3 = 'third'
folder_4 = 'fourth'
folder_5 = 'fifth'
folder_6 = 'output'
for i in [folder_1, folder_2, folder_3, folder_4, folder_5, folder_6]:
    if not os.path.isdir(i):
        os.makedirs(i)

source = 'train/'
files = os.listdir('train')
for i in range(len(files)):
    if i % 5 == 0:
        shutil.copy(source + files[i], 'first')
    elif i % 5 == 1:
        shutil.copy(source + files[i], 'second')
    elif i % 5 == 2:
        shutil.copy(source + files[i], 'third')
    elif i % 5 == 3:
        shutil.copy(source + files[i], 'fourth')
    elif i % 5 == 4:
        shutil.copy(source + files[i], 'fifth')


# In[24]:
# http://flint.cs.yale.edu/cs421/papers/x86-asm/asm.html
opcodefile = open("opcodes.txt", 'w+')

# The prefixes are the segments present in the asm files.
# There are ~450 distinct segments across all asm files;
# these prefixes are the segments that gave us the best results.
# https://en.wikipedia.org/wiki/Data_segment
prefixes = ['HEADER:', '.text:', '.Pav:', '.idata:', '.data:', '.bss:', '.rdata:',
            '.edata:', '.rsrc:', '.tls:', '.reloc:', '.BSS:', '.CODE']

# the opcodes that gave the best results
# https://en.wikipedia.org/wiki/X86_instruction_listings
opcodes = ['jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc',
           'dec', 'add', 'imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror',
           'rol', 'jnb', 'jz', 'rtn', 'lea', 'movzx']

# the best keywords, taken from different blogs
keywords = ['.dll', 'std::', ':dword']

# general-purpose and special registers that worked best
registers = ['edx', 'esi', 'eax', 'ebx', 'ecx', 'edi', 'ebp', 'esp', 'eip']


def extract_asm_features(folder, outname):
    """Count prefixes, opcodes, registers and keywords for every .asm file
    in `folder` and write one CSV row per file to `outname`."""
    file1 = open(outname, "w+")
    files = os.listdir(folder)
    for f in files:
        # initialise the count arrays with zeros
        prefixescount = np.zeros(len(prefixes), dtype=int)
        opcodescount = np.zeros(len(opcodes), dtype=int)
        keywordcount = np.zeros(len(keywords), dtype=int)
        registerscount = np.zeros(len(registers), dtype=int)
        features = []
        f2 = f.split('.')[0]
        file1.write(f2 + ",")
        opcodefile.write(f2 + " ")
        # https://docs.python.org/3/library/codecs.html#codecs.ignore_errors
        # https://docs.python.org/3/library/codecs.html#codecs.Codec.encode
        with codecs.open(folder + '/' + f, encoding='cp1252', errors='replace') as fli:
            for lines in fli:
                # https://www.tutorialspoint.com/python3/string_rstrip.htm
                line = lines.rstrip().split()
                if not line:
                    continue
                l = line[0]
                # count the prefixes in each line
                for i in range(len(prefixes)):
                    if prefixes[i] in line[0]:
                        prefixescount[i] += 1
                line = line[1:]
                # count the opcodes in each line
                for i in range(len(opcodes)):
                    if any(opcodes[i] == li for li in line):
                        features.append(opcodes[i])
                        opcodescount[i] += 1
                # count registers in the line; registers are counted only
                # inside the 'text' and 'CODE' segments
                for i in range(len(registers)):
                    for li in line:
                        if registers[i] in li and ('text' in l or 'CODE' in l):
                            registerscount[i] += 1
                # count keywords in the line
                for i in range(len(keywords)):
                    for li in line:
                        if keywords[i] in li:
                            keywordcount[i] += 1
        # push the counts into the file after reading the whole file
        for prefix in prefixescount:
            file1.write(str(prefix) + ",")
        for opcode in opcodescount:
            file1.write(str(opcode) + ",")
        for register in registerscount:
            file1.write(str(register) + ",")
        for key in keywordcount:
            file1.write(str(key) + ",")
        file1.write("\n")
    file1.close()


def main():
    # the code below uses multiprocessing: one process per folder;
    # the number of processes should match the number of cores in the system
    p1 = Process(target=extract_asm_features, args=('first', 'asmsmallfile.txt'))
    p2 = Process(target=extract_asm_features, args=('second', 'mediumasmfile.txt'))
    p3 = Process(target=extract_asm_features, args=('third', 'largeasmfile.txt'))
    p4 = Process(target=extract_asm_features, args=('fourth', 'hugeasmfile.txt'))
    p5 = Process(target=extract_asm_features, args=('fifth', 'trainasmfile.txt'))
    # start() launches each process
    p1.start()
    p2.start()
    p3.start()
    p4.start()
    p5.start()
    # after completion, all the processes are joined
    p1.join()
    p2.join()
    p3.join()
    p4.join()
    p5.join()


if __name__ == "__main__":
    main()

# Manually assign headers in the order the counts were written above:
# Id, prefixes, opcodes, registers, keywords
feature_headers = ['Id'] + prefixes + opcodes + registers + keywords

# File names for merging
output_files = [
    "asmsmallfile.txt",
    "mediumasmfile.txt",
    "largeasmfile.txt",
    "hugeasmfile.txt",
    "trainasmfile.txt"
]

df_list = []
for file in output_files:
    df = pd.read_csv(file, header=None)  # load each file into a pandas DataFrame
    df_list.append(df)

# Concatenate all DataFrames along axis 0 (rows)
merged_df = pd.concat(df_list, axis=0)

# Drop the empty column produced by the trailing comma, then assign headers
merged_df = merged_df.iloc[:, :len(feature_headers)]
merged_df.columns = feature_headers

# Save to CSV with headers
merged_df.to_csv("asmoutputfile.csv", index=False, header=True)

# +++++++++++++++++++++++++++++++++++++++++++++++++++
# Verify the output
dfasm = pd.read_csv("asmoutputfile.csv")
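# As a quick sanity check on the merged file (a minimal sketch; the exact
# row count depends on the local train/ dump, so the printed numbers will vary):

# In[ ]:

print(dfasm.shape)                     # one row per processed .asm file
print(dfasm['Id'].duplicated().any())  # expect False: Ids should be unique
print(dfasm.isnull().sum().sum())      # expect 0 for a clean merge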

22. File sizes of each .asm file as a feature

#### [Back to the top](#0)

# In[ ]:

# file sizes of asm files
files = os.listdir('train')
filenames = Y['Id'].tolist()
class_y = Y['Class'].tolist()
# map each Id to its class once, instead of an O(n) list search per file
id_to_class = dict(zip(filenames, class_y))
class_bytes = []
sizebytes = []
fnames = []
for file in files:
    # example: os.stat('byteFiles/0A32eTdBKayjCWhZqDOQ.txt') returns
    # os.stat_result(st_mode=33206, st_ino=1125899906874507, st_dev=3561571700, st_nlink=1,
    #                st_uid=0, st_gid=0, st_size=3680109, st_atime=1519638522,
    #                st_mtime=1519638522, st_ctime=1519638522)
    # read more about os.stat here: https://www.tutorialspoint.com/python/os_stat.htm
    statinfo = os.stat('train/' + file)
    # split the file name at '.' and take the first part, i.e. the Id
    file = file.split('.')[0]
    if file in id_to_class:
        class_bytes.append(id_to_class[file])
        # convert bytes into MB
        sizebytes.append(statinfo.st_size / (1024.0 * 1024.0))
        fnames.append(file)

asm_size_byte = pd.DataFrame({'Id': fnames, 'size': sizebytes, 'Class': class_bytes})
result_asm = asm_size_byte.fillna(0)  # replace NaN with 0
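# The same table can also be built without per-file Python bookkeeping;
# a loop-free sketch using a pandas merge (assumes the same Y and train/
# directory as above, and is not used by the cells below):

# In[ ]:

listing = os.listdir('train')
sizes_df = pd.DataFrame({
    'Id': [f.split('.')[0] for f in listing],
    'size': [os.stat('train/' + f).st_size / (1024.0 * 1024.0) for f in listing],
})
# inner merge keeps only the files that have a label in trainLabels.csv
asm_size_alt = sizes_df.merge(Y[['Id', 'Class']], on='Id', how='inner')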

4.2.1.2 Distribution of .asm file sizes

# In[ ]:

# boxplot of asm file sizes
ax = sns.boxplot(x="Class", y="size", data=asm_size_byte)
plt.title("boxplot of .asm file sizes")
plt.show()

# ![Imgur](https://imgur.com/egYeXAJ.png)

# In[ ]:

result_asm = dfasm
result_asm = pd.merge(result_asm, asm_size_byte, on='Id', how='left')
result_asm.head()

# In[ ]:

# normalize each column of the data
result_asm = normalize(result_asm)
result_asm.head()
result_asm = result_asm.fillna(0)  # replace NaN with 0
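# normalize() above is plain column-wise min-max scaling, so the result can
# be cross-checked against sklearn's MinMaxScaler (a sketch, not used below):

# In[ ]:

from sklearn.preprocessing import MinMaxScaler

def normalize_sklearn(df):
    result = df.copy()
    cols = [c for c in df.columns if c not in ('Id', 'Class')]
    result[cols] = MinMaxScaler().fit_transform(df[cols])  # scale each column to [0, 1]
    return result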

23. Univariate analysis ONLY on .asm file features

#### [Back to the top](#0)

# In[ ]:

ax = sns.boxplot(x="Class", y=".text:", data=result_asm)
plt.title("boxplot of .asm text segment")
plt.show()

# ![Imgur](https://imgur.com/5jWiNtY.png)
# The plot is between the .text segment counts and the class label.
# Classes 1, 2 and 9 can be easily separated.
# 
# In[ ]:

ax = sns.boxplot(x="Class", y=".Pav:", data=result_asm)
plt.title("boxplot of .asm Pav segment")
plt.show()

# ![Imgur](https://imgur.com/clvpMB9.png)

# In[ ]:

ax = sns.boxplot(x="Class", y=".data:", data=result_asm)
plt.title("boxplot of .asm data segment")
plt.show()

# ![Imgur](https://imgur.com/CqJhugg.png)
# The plot is between the .data segment counts and the class label.
# Classes 6 and 9 can be easily separated from the given points.
# 
# In[ ]:

ax = sns.boxplot(x="Class", y=".bss:", data=result_asm)
plt.title("boxplot of .asm bss segment")
plt.show()

# ![Imgur](https://imgur.com/GKa73JO.png)
# Plot between the .bss segment counts and the class label.
# Very few files have a .bss segment.
# 
# In[ ]:

result_asm = result_asm.dropna(subset=['.rdata:'])  # drop rows where '.rdata:' is NaN
ax = sns.boxplot(x="Class", y=".rdata:", data=result_asm)
plt.title("boxplot of .asm rdata segment")
plt.show()

# ![Imgur](https://imgur.com/SPZxLJL.png)
# Plot between the .rdata segment counts and the class label.
# Class 2 can be easily separated: the 75th percentile of its files has about 1M .rdata lines.
# 
# In[ ]:

ax = sns.boxplot(x="Class", y="jmp", data=result_asm)
plt.title("boxplot of .asm jmp opcode")
plt.show()

# ![Imgur](https://imgur.com/0e0ylU2.png)
# Plot between the jmp opcode counts and the class label.
# For class 1, the 75th percentile of files has a jmp frequency of roughly 2000.
# 
# In[ ]:

ax = sns.boxplot(x="Class", y="mov", data=result_asm)
plt.title("boxplot of .asm mov opcode")
plt.show()

# ![Imgur](https://imgur.com/Jr5dOJk.png)
# Plot between the class label and the mov opcode counts.
# For class 1, the 75th percentile of files has a mov frequency of roughly 2000.
# 
# In[ ]:

ax = sns.boxplot(x="Class", y="retf", data=result_asm)
plt.title("boxplot of .asm retf opcode")
plt.show()

# ![Imgur](https://imgur.com/VQ25RTI.png)
# Plot between the class label and the retf opcode counts.
# Class 6 can be easily separated with the retf opcode;
# its retf frequency is approximately 250.
# 
# In[ ]:

ax = sns.boxplot(x="Class", y="push", data=result_asm)
plt.title("boxplot of .asm push opcode")
plt.show()

# ![Imgur](https://imgur.com/FLpSOdK.png)
# Plot between the push opcode counts and the class label.
# For class 1, the 75th percentile of files has a push frequency of about 1000.
# 
#

24. Multivariate Analysis ONLY on .asm file features

#### [Back to the top](#0)

# In[ ]:

# multivariate analysis on asm files
# this run uses perplexity 50
xtsne = TSNE(perplexity=50)
results = xtsne.fit_transform(result_asm.drop(['Id', 'Class'], axis=1).fillna(0))
data_y = result_asm['Class']
vis_x = results[:, 0]
vis_y = results[:, 1]
plt.scatter(vis_x, vis_y, c=data_y, cmap=plt.cm.get_cmap("jet", 9))
plt.colorbar(ticks=range(10))
plt.clim(0.5, 9)
plt.show()

# ![Imgur](https://imgur.com/tR4nhGB.png)

# In[ ]:

# the univariate analysis showed that the 'rtn', '.BSS:' and '.CODE' features
# carry negligible information, so here we retry the multivariate analysis
# after removing those features; the plot still looks very messy
xtsne = TSNE(perplexity=30)
results = xtsne.fit_transform(result_asm.drop(['Id', 'Class', 'rtn', '.BSS:', '.CODE', 'size'], axis=1))
vis_x = results[:, 0]
vis_y = results[:, 1]
plt.scatter(vis_x, vis_y, c=data_y, cmap=plt.cm.get_cmap("jet", 9))
plt.colorbar(ticks=range(10))
plt.clim(0.5, 9)
plt.show()

# ![Imgur](https://imgur.com/3Fevxnl.png)
# t-SNE on the .asm features after dropping the low-information columns (perplexity 30)
# 
#
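# t-SNE is stochastic, so the two embeddings above will differ from run to run;
# a sketch of how to make the plot reproducible (the seed value is an
# assumption, any constant works):

# In[ ]:

xtsne = TSNE(perplexity=50, random_state=42)
results = xtsne.fit_transform(result_asm.drop(['Id', 'Class'], axis=1).fillna(0))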

25. Conclusion on EDA (ONLY on .asm file features)

#### [Back to the top](#0)

  • We have taken only 52 features from the asm files (after reading through many blogs and research papers).
  • The univariate analysis was done only on a few important features.
  • Take-aways:
      • Classes 1, 2 and 9 separate well on the .text segment counts, class 6 on the retf opcode, and class 2 on the .rdata segment.
      • The 'rtn', '.BSS:' and '.CODE' features carry negligible information, so they are dropped before modelling.
26. Train and test split (ONLY on .asm file features)

#### [Back to the top](#0)

# In[ ]:

asm_y = result_asm['Class']
asm_x = result_asm.drop(['Id', 'Class', '.BSS:', 'rtn', '.CODE'], axis=1)

# In[ ]:

class_counts = asm_y.value_counts()
print(class_counts)
# 64/16/20 split: hold out 20% for test, then 20% of the remainder for CV,
# stratified so that every split keeps the class mix
X_train_asm, X_test_asm, y_train_asm, y_test_asm = train_test_split(asm_x, asm_y, stratify=asm_y, test_size=0.20)
X_train_asm, X_cv_asm, y_train_asm, y_cv_asm = train_test_split(X_train_asm, y_train_asm, stratify=y_train_asm, test_size=0.20)

# In[ ]:

print(X_cv_asm.isnull().all())
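# Because both splits are stratified, the three sets should show (almost) the
# same class proportions; a quick check sketch:

# In[ ]:

for name, ys in [('train', y_train_asm), ('cv', y_cv_asm), ('test', y_test_asm)]:
    print(name, ys.value_counts(normalize=True).sort_index().round(3).to_dict())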

27. K-Nearest Neighbors ONLY on .asm file features

#### [Back to the top](#0)

# In[ ]:

# find more about KNeighborsClassifier here:
# http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
# -------------------------
# default parameters:
# KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2,
#                      metric='minkowski', metric_params=None, n_jobs=1, **kwargs)
# methods:
# fit(X, y): fit the model using X as training data and y as target values
# predict(X): predict the class labels for the provided data
# predict_proba(X): return probability estimates for the test data X
#
# find more about CalibratedClassifierCV here:
# http://scikit-learn.org/stable/modules/generated/sklearn.calibration.CalibratedClassifierCV.html
# ----------------------------
# default parameters:
# sklearn.calibration.CalibratedClassifierCV(base_estimator=None, method='sigmoid', cv=3)
# some of the methods of CalibratedClassifierCV():
# fit(X, y[, sample_weight]): fit the calibrated model
# get_params([deep]): get parameters for this estimator
# predict(X): predict the target of new samples
# predict_proba(X): posterior probabilities of classification

alpha = [x for x in range(1, 21, 2)]  # candidate values of k
cv_log_error_array = []
for i in alpha:
    k_cfl = KNeighborsClassifier(n_neighbors=i)
    k_cfl.fit(X_train_asm, y_train_asm)
    sig_clf = CalibratedClassifierCV(k_cfl, method="sigmoid")
    sig_clf.fit(X_train_asm, y_train_asm)
    predict_y = sig_clf.predict_proba(X_cv_asm)
    cv_log_error_array.append(log_loss(y_cv_asm, predict_y, labels=k_cfl.classes_, eps=1e-15))

for i in range(len(cv_log_error_array)):
    print('log_loss for k =', alpha[i], 'is', cv_log_error_array[i])

best_alpha = np.argmin(cv_log_error_array)
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# refit with the best k and report train / cv / test log loss
k_cfl = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
k_cfl.fit(X_train_asm, y_train_asm)
sig_clf = CalibratedClassifierCV(k_cfl, method="sigmoid")
sig_clf.fit(X_train_asm, y_train_asm)
pred_y = sig_clf.predict(X_test_asm)
predict_y = sig_clf.predict_proba(X_train_asm)
print('log loss for train data', log_loss(y_train_asm, predict_y))
predict_y = sig_clf.predict_proba(X_cv_asm)
print('log loss for cv data', log_loss(y_cv_asm, predict_y))
predict_y = sig_clf.predict_proba(X_test_asm)
print('log loss for test data', log_loss(y_test_asm, predict_y))
plot_confusion_matrix(y_test_asm, sig_clf.predict(X_test_asm))

os.makedirs('asm_models', exist_ok=True)  # ensure the model directory exists
with open('asm_models/KNeighborsClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)

# ![Imgur](https://imgur.com/xtCOdJi.png)
# ![Imgur](https://imgur.com/vTUky0K.png)
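# The fit -> calibrate -> score pattern above is repeated for every model in
# the sections below; a small helper (a sketch, not used by the original
# cells) that captures it in one place:

# In[ ]:

def calibrated_log_losses(base_clf):
    """Fit base_clf, wrap it in a sigmoid-calibrated classifier, and return
    the calibrated model together with its train/cv/test log losses."""
    base_clf.fit(X_train_asm, y_train_asm)
    sig = CalibratedClassifierCV(base_clf, method="sigmoid")
    sig.fit(X_train_asm, y_train_asm)
    return sig, {
        'train': log_loss(y_train_asm, sig.predict_proba(X_train_asm)),
        'cv': log_loss(y_cv_asm, sig.predict_proba(X_cv_asm)),
        'test': log_loss(y_test_asm, sig.predict_proba(X_test_asm)),
    }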

28. Logistic Regression ONLY on .asm file features

#### [Back to the top](#0)

# In[ ]:

# read more about LogisticRegression at:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# ------------------------------
# some of its methods:
# fit(X, y[, sample_weight]): fit the model according to the given training data
# predict(X): predict class labels for samples in X
# predict_proba(X): probability estimates for samples in X

alpha = [10 ** x for x in range(-5, 4)]  # candidate values of the inverse regularization strength C
cv_log_error_array = []
for i in alpha:
    logisticR = LogisticRegression(penalty='l2', C=i, class_weight='balanced')
    logisticR.fit(X_train_asm, y_train_asm)
    sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
    sig_clf.fit(X_train_asm, y_train_asm)
    predict_y = sig_clf.predict_proba(X_cv_asm)
    cv_log_error_array.append(log_loss(y_cv_asm, predict_y, labels=logisticR.classes_, eps=1e-15))

for i in range(len(cv_log_error_array)):
    print('log_loss for c =', alpha[i], 'is', cv_log_error_array[i])

best_alpha = np.argmin(cv_log_error_array)
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# refit with the best C and report train / cv / test log loss
logisticR = LogisticRegression(penalty='l2', C=alpha[best_alpha], class_weight='balanced')
logisticR.fit(X_train_asm, y_train_asm)
sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
sig_clf.fit(X_train_asm, y_train_asm)
predict_y = sig_clf.predict_proba(X_train_asm)
print('log loss for train data', log_loss(y_train_asm, predict_y, labels=logisticR.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(X_cv_asm)
print('log loss for cv data', log_loss(y_cv_asm, predict_y, labels=logisticR.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(X_test_asm)
print('log loss for test data', log_loss(y_test_asm, predict_y, labels=logisticR.classes_, eps=1e-15))
plot_confusion_matrix(y_test_asm, sig_clf.predict(X_test_asm))

with open('asm_models/LogisticRegression.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)

# ![Imgur](https://imgur.com/8uIh7cZ.png)
# ![Imgur](https://imgur.com/wV4w7Er.png)

29. Random Forest Classifier ONLY on .asm file features

#### [Back to the top](#0)

# In[ ]:

# find more about RandomForestClassifier here:
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# --------------------------------
# default parameters:
# sklearn.ensemble.RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2,
#     min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None,
#     min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1,
#     random_state=None, verbose=0, warm_start=False, class_weight=None)
# some of the methods of RandomForestClassifier():
# fit(X, y[, sample_weight]): build a forest of trees from the given training data
# predict(X): predict class labels for samples in X
# predict_proba(X): predict class probabilities for samples in X
# some of the attributes of RandomForestClassifier():
# feature_importances_: array of shape = [n_features]
#     the feature importances (the higher, the more important the feature)

alpha = [10, 50, 100, 500, 1000, 2000, 3000]  # candidate numbers of trees
cv_log_error_array = []
for i in alpha:
    r_cfl = RandomForestClassifier(n_estimators=i, random_state=42, n_jobs=-1)
    r_cfl.fit(X_train_asm, y_train_asm)
    sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
    sig_clf.fit(X_train_asm, y_train_asm)
    predict_y = sig_clf.predict_proba(X_cv_asm)
    cv_log_error_array.append(log_loss(y_cv_asm, predict_y, labels=r_cfl.classes_, eps=1e-15))

for i in range(len(cv_log_error_array)):
    print('log_loss for c =', alpha[i], 'is', cv_log_error_array[i])

best_alpha = np.argmin(cv_log_error_array)
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# refit with the best n_estimators and report train / cv / test log loss
r_cfl = RandomForestClassifier(n_estimators=alpha[best_alpha], random_state=42, n_jobs=-1)
r_cfl.fit(X_train_asm, y_train_asm)
sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
sig_clf.fit(X_train_asm, y_train_asm)
predict_y = sig_clf.predict_proba(X_train_asm)
print('log loss for train data', log_loss(y_train_asm, predict_y, labels=sig_clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(X_cv_asm)
print('log loss for cv data', log_loss(y_cv_asm, predict_y, labels=sig_clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(X_test_asm)
print('log loss for test data', log_loss(y_test_asm, predict_y, labels=sig_clf.classes_, eps=1e-15))
plot_confusion_matrix(y_test_asm, sig_clf.predict(X_test_asm))

with open('asm_models/RandomForestClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)

# ![Imgur](https://imgur.com/C431Dn7.png)
# ![Imgur](https://imgur.com/RwZwWtJ.png)

30. XGBoost Classifier ONLY on .asm file features

#### [Back to the top](#0)

# In[ ]:

# training an XGBoost classifier on our train data, tuning n_estimators
# find more about XGBClassifier here:
# http://xgboost.readthedocs.io/en/latest/python/python_api.html?#xgboost.XGBClassifier
# -------------------------
# default parameters:
# xgboost.XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True,
#     objective='binary:logistic', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1,
#     max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
#     scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, **kwargs)
# some of the methods of XGBClassifier():
# fit(X, y, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None)
# get_params([deep]): get parameters for this estimator
# predict(data, output_margin=False, ntree_limit=0): predict with data (NOTE: this function is not thread safe)
# get_score(importance_type='weight'): get the feature importance

alpha = [10, 50, 100, 500, 1000, 2000, 3000]  # candidate numbers of boosting rounds
cv_log_error_array = []
for i in alpha:
    x_cfl = XGBClassifier(n_estimators=i, nthread=-1)
    x_cfl.fit(X_train_asm, y_train_asm)
    sig_clf = CalibratedClassifierCV(x_cfl, method="sigmoid")
    sig_clf.fit(X_train_asm, y_train_asm)
    predict_y = sig_clf.predict_proba(X_cv_asm)
    cv_log_error_array.append(log_loss(y_cv_asm, predict_y, labels=x_cfl.classes_, eps=1e-15))

for i in range(len(cv_log_error_array)):
    print('log_loss for c =', alpha[i], 'is', cv_log_error_array[i])

best_alpha = np.argmin(cv_log_error_array)
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# refit with the best n_estimators and report train / cv / test log loss
x_cfl = XGBClassifier(n_estimators=alpha[best_alpha], nthread=-1)
x_cfl.fit(X_train_asm, y_train_asm)
sig_clf = CalibratedClassifierCV(x_cfl, method="sigmoid")
sig_clf.fit(X_train_asm, y_train_asm)
predict_y = sig_clf.predict_proba(X_train_asm)
print('For values of best alpha =', alpha[best_alpha], 'the train log loss is:', log_loss(y_train_asm, predict_y))
predict_y = sig_clf.predict_proba(X_cv_asm)
print('For values of best alpha =', alpha[best_alpha], 'the cross validation log loss is:', log_loss(y_cv_asm, predict_y))
predict_y = sig_clf.predict_proba(X_test_asm)
print('For values of best alpha =', alpha[best_alpha], 'the test log loss is:', log_loss(y_test_asm, predict_y))
plot_confusion_matrix(y_test_asm, sig_clf.predict(X_test_asm))

with open('asm_models/XGBClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)

# ![Imgur](https://imgur.com/JMb1GDQ.png)
# ![Imgur](https://imgur.com/mp296Le.png)

31. XGBoost Classifier with best hyperparameters (ONLY on .asm file features)

#### [Back to the top](#0)

# In[ ]:

x_cfl = XGBClassifier()
params = {
    'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.15, 0.2],
    'n_estimators': [100, 200, 500, 1000, 2000],
    'max_depth': [3, 5, 10],
    'colsample_bytree': [0.1, 0.3, 0.5, 1],
    'subsample': [0.1, 0.3, 0.5, 1]
}
random_cfl = RandomizedSearchCV(x_cfl, param_distributions=params, verbose=10, n_jobs=-1)
random_cfl.fit(X_train_asm, y_train_asm)

# In[ ]:

print(random_cfl.best_params_)

# In[ ]:

# training an XGBoost classifier with the tuned hyper-parameters on our train data
# find more about XGBClassifier here:
# http://xgboost.readthedocs.io/en/latest/python/python_api.html?#xgboost.XGBClassifier
x_cfl = XGBClassifier(n_estimators=200, subsample=0.5, learning_rate=0.15, colsample_bytree=0.5, max_depth=3)
x_cfl.fit(X_train_asm, y_train_asm)
c_cfl = CalibratedClassifierCV(x_cfl, method='sigmoid')
c_cfl.fit(X_train_asm, y_train_asm)
predict_y = c_cfl.predict_proba(X_train_asm)
print('train loss', log_loss(y_train_asm, predict_y))
predict_y = c_cfl.predict_proba(X_cv_asm)
print('cv loss', log_loss(y_cv_asm, predict_y))
predict_y = c_cfl.predict_proba(X_test_asm)
print('test loss', log_loss(y_test_asm, predict_y))
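# Unlike the earlier models, the tuned classifier above is never pickled; a
# matching save/reload round-trip (a sketch — the filename is an assumption):

# In[ ]:

os.makedirs('asm_models', exist_ok=True)
with open('asm_models/XGBClassifier_tuned.pkl', 'wb') as model_file:
    pickle.dump(c_cfl, model_file)
with open('asm_models/XGBClassifier_tuned.pkl', 'rb') as model_file:
    c_cfl_loaded = pickle.load(model_file)
print('reloaded test loss', log_loss(y_test_asm, c_cfl_loaded.predict_proba(X_test_asm)))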