#!/usr/bin/env python # coding: utf-8 # In[1]: import warnings warnings.filterwarnings("ignore") import shutil import os import pandas as pd import matplotlib matplotlib.use('nbAgg') import matplotlib.pyplot as plt import seaborn as sns import numpy as np from tqdm import tqdm import pickle from sklearn.manifold import TSNE from sklearn import preprocessing from sklearn.model_selection import RandomizedSearchCV, train_test_split from sklearn.tree import DecisionTreeClassifier from sklearn.calibration import CalibratedClassifierCV from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import log_loss, confusion_matrix from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier from sklearn.feature_selection import SelectKBest, chi2, f_regression import re from nltk.util import ngrams import scipy.sparse import gc import pickle as pkl from datetime import datetime as dt import dask.dataframe as dd import tornado import tqdm import xgboost import nltk import dask # In[2]: #imported all the bytes files # root_path = '/Users/yasha/Desktop/Li-phy/Bytes/fuso63yW7wAXO20gJIBS.bytes' # root_path = '/Users/yasha/Desktop/Li-phy/Bytes/' # # #get_sha256_hash(root_path) # # file_open = open(root_path,"rb") # # print(file_open.read()) # # hex_representation = ' '.join(f'{byte:02X}' for byte in file_open.read()) # # print(hex_representation) # # file_open.close() # #file_open.close() # if os.path.isdir(root_path): # data_files = os.listdir(root_path) # print(root_path) # for file in data_files: # #print(file) # path_now=root_path+file # file_open = open(path_now,"rb") # #print(file_open.read()) # # hex_representation = ' '.join(f'{byte:02X}' for byte in file_open.read()) # # print(hex_representation) # # file_open.close() #from injected files let us analyze how much % malware coverage do we have for each class. 
#data_classification = pd.read_csv('/Users/yasha/Desktop/Li-phy/trainLabels.csv')

# Load the label table and draw one bar per malware class.
Y = pd.read_csv("trainLabels.csv")
total = len(Y) * 1.
ax = sns.countplot(x="Class", data=Y)

# Print each bar's share of the whole data set (in percent) just above it.
for bar in ax.patches:
    share_text = f'{100 * bar.get_height() / total:.1f}%'
    ax.annotate(share_text, (bar.get_x() + 0.1, bar.get_height() + 5))

# 11 ticks (i.e. 10 steps) from 0 up to the number of rows in the dataframe.
ax.yaxis.set_ticks(np.linspace(0, total, 11))

# Show the ticks as percentages; their positions stay where they are.
tick_shares = 100 * ax.yaxis.get_majorticklocs() / total
ax.set_yticklabels([f'{share:.1f}%' for share in tick_shares])
plt.show()

# In[3]:
#For each of the class I have calculated the size.
# data_files = os.listdir(root_path)
# for file in data_files:
#     path_now=root_path+file
#     files=open(path_now,"rb")
# files=os.listdir("/Users/yasha/Desktop/malware/Bytes/")
# filenames=Y['Id'].tolist()
# class_y=Y['Class'].tolist()
# class_bytes=[]
# sizebytes=[]
# fnames=[]
# for file in files:
#     # print(os.stat('byteFiles/0A32eTdBKayjCWhZqDOQ.txt'))
#     # os.stat_result(st_mode=33206, st_ino=1125899906874507, st_dev=3561571700, st_nlink=1, st_uid=0, st_gid=0,
#     # st_size=3680109, st_atime=1519638522, st_mtime=1519638522, st_ctime=1519638522)
#     # read more about os.stat: here https://www.tutorialspoint.com/python/os_stat.htm
#     statinfo=os.stat("/Users/yasha/Desktop/malware/BYTES/"+file)
#     #print(statinfo)
#     # split the file name at '.'
# and take the first part of it i.e the file name
#     file=file.split('.')[0]
#     if any(file == filename for filename in filenames):
#         i=filenames.index(file)
#         print(i)
#         class_bytes.append(class_y[i])
#         # converting into Mb's
#         sizebytes.append(statinfo.st_size/(1024.0*1024.0))
#         fnames.append(file)
# data_size_byte=pd.DataFrame({'ID':fnames,'size':sizebytes,'Class':class_bytes})
# print (data_size_byte)

import os
from collections import Counter

import pandas as pd

# Assuming Y is a DataFrame that you already have
# Y = pd.read_csv('your_file.csv')  # Example, if you need to load Y

# Record the on-disk size (in MB) and the class of every entry of BYTES-train
# whose base name appears in the label table Y.
files = os.listdir("BYTES-train")
filenames = Y['Id'].tolist()
class_y = Y['Class'].tolist()

# Id -> first row position in Y.  One dict lookup per file replaces the
# `in list` + `list.index` pair, which rescanned the whole Id list each time.
index_by_id = {}
for position, file_id in enumerate(filenames):
    index_by_id.setdefault(file_id, position)

class_bytes = []
sizebytes = []
fnames = []
for file in files:
    file_name_no_ext = file.split('.')[0]
    file_path = os.path.join("BYTES-train", file)
    try:
        statinfo = os.stat(file_path)
    except FileNotFoundError:
        # The entry vanished between listdir() and stat(); skip it.
        print(f"File not found: {file_path}")
        continue

    i = index_by_id.get(file_name_no_ext)
    if i is None:
        print(f"No match found for file: {file_name_no_ext} ")
        continue

    print(f"File matched: {file_name_no_ext} at index {i}")
    class_bytes.append(class_y[i])
    sizebytes.append(statinfo.st_size / (1024.0 * 1024.0))  # converting size to MB
    fnames.append(file_name_no_ext)

data_size_byte = pd.DataFrame({'ID': fnames, 'size': sizebytes, 'Class': class_bytes})

# In[4]:
#box plot of file size (.bytes file)
ax = sns.boxplot(x="Class", y="size", data=data_size_byte)
plt.title("boxplot of .bytes file sizes")
plt.show()

# In[5]:
#Extracting unigram of byte files

# Step 1: rewrite every <id>.bytes file as <id>.txt without the first
# (space-separated) field of each line, then delete the .bytes original.
files = os.listdir("BYTES-train")
# NOTE(review): `filenames` and `array` are not used by this cell; they are
# still (re)bound here in case a later cell relies on them.
filenames = []
array = []
for file in files:
    if not file.endswith(".bytes"):
        continue
    base_name = file.split('.')[0]
    bytes_path = os.path.join("BYTES-train", file)
    txt_path = os.path.join("BYTES-train", base_name + ".txt")
    # Both handles are closed by the `with` even if a line fails to convert.
    with open(bytes_path, "r") as fp, open(txt_path, "w") as text_file:
        for line in fp:
            text_file.write(' '.join(line.rstrip().split(" ")[1:]) + "\n")
    # Remove the source only after the converted copy is completely written.
    os.remove(bytes_path)

# Step 2: count, per .txt file, how often each byte value 00..ff and the
# '??' placeholder occurs, and write one CSV row per file to result.csv.
files = os.listdir('BYTES-train')
# Only the converted .txt files carry byte data; other directory entries used
# to end up in result.csv as all-zero rows.
txt_files = [name for name in files if name.endswith(".txt")]
filenames2 = []
feature_matrix = np.zeros((len(txt_files), 257), dtype=int)

# Column names: '0'..'9', then '0a'..'ff', then '??' (same header as before).
byte_columns = [str(value) if value < 10 else f"{value:02x}" for value in range(256)]

with open('result.csv', 'w') as byte_feature_file:
    byte_feature_file.write("ID," + ",".join(byte_columns) + ",??")
    byte_feature_file.write("\n")
    for k, file in enumerate(txt_files):
        filenames2.append(file)

        # Tally whole lines at once; split() without an argument also drops
        # the empty tokens produced by blank lines or repeated spaces.
        token_counts = Counter()
        with open(os.path.join('BYTES-train', file), "r") as byte_file:
            for line in byte_file:
                token_counts.update(line.split())

        for hex_code, count in token_counts.items():
            if hex_code == '??':
                feature_matrix[k, 256] += count
                continue
            try:
                column = int(hex_code, 16)
            except ValueError as e:
                print(f"Error Occured @ {file} - {e}")
                continue
            if not 0 <= column <= 255:
                # A negative value would silently wrap around to another column.
                print(f"Error Occured @ {file} - byte value out of range: {hex_code}")
                continue
            feature_matrix[k, column] += count

        byte_feature_file.write(file + "," + ",".join(map(str, feature_matrix[k])) + "\n")

# # Paths
# input_dir = "/Users/yasha/Desktop/malware/BYTES-train"
# output_csv = '/Users/yasha/Desktop/malware/BYTES-train/result.csv'
# # Step 1: Extracting unigrams of byte files
# files = [file for file in os.listdir(input_dir) if file.endswith(".bytes")]
# for file in files:
#     base_name = file.split('.')[0]
#     bytes_path = os.path.join(input_dir, file)
#     txt_path = os.path.join(input_dir, f"{base_name}.txt")
#     with open(bytes_path, 'r') as fp, open(txt_path, 'w') as
# text_file:
#         for line in fp:
#             a = line.rstrip().split(" ")[1:]
#             text_file.write(' '.join(a) + "\n")
#     os.remove(bytes_path)  # Remove the original .bytes file
# # Step 2: Compute feature matrix
# files_txt = [file for file in os.listdir(input_dir) if file.endswith(".txt")]
# num_files = len(files_txt)
# num_features = 257
# feature_matrix = np.zeros((num_files, num_features), dtype=int)
# # Writing CSV header
# header = ",".join(f"{i:02x}" for i in range(num_features)) + ",??"
# with open(output_csv, 'w') as byte_feature_file:
#     byte_feature_file.write(f"ID,{header}\n")
#     for k, file in enumerate(files_txt):
#         base_name = file.split('.')[0]
#         file_path = os.path.join(input_dir, file)
#         with open(file_path, 'r') as byte_file:
#             for line in byte_file:
#                 line = line.rstrip().split(" ")
#                 for hex_code in line:
#                     if hex_code == '??':
#                         feature_matrix[k, 256] += 1
#                     else:
#                         feature_matrix[k, int(hex_code, 16)] += 1
#         row = ",".join(map(str, feature_matrix[k]))  # Convert row to CSV format
#         byte_feature_file.write(f"{base_name},{row}\n")

# In[6]:
# Reload the unigram counts; the ID column holds file names, so keep only the
# part before the first '.' to obtain the bare sample id.
byte_features = pd.read_csv("result.csv")
print("Original id: ", byte_features['ID'][0])
byte_features['ID'] = byte_features['ID'].str.split('.').str[0]
print("byte_Feature: ", byte_features.head(2))

# In[7]:
print("byte_size: ", data_size_byte.head(2))

# In[8]:
# Attach file size and class to the byte counts (inner join on the sample id).
byte_features_with_size = byte_features.merge(data_size_byte, on='ID')
byte_features_with_size.to_csv("result_with_size.csv")
print("Combined: ", byte_features_with_size.head(2))


# In[9]:
def normalize(df):
    """Min-max scale every column except 'ID' and 'Class' to the range [0, 1].

    A column whose values are all identical has max == min; dividing by that
    zero span used to fill the column with NaN (which t-SNE and the
    classifiers reject), so such a column is set to 0.0 instead.

    Args:
        df: DataFrame whose columns, apart from 'ID' and 'Class', are numeric.

    Returns:
        A scaled copy of ``df``; the input frame is not modified.
    """
    result1 = df.copy()
    for feature_name in df.columns:
        if str(feature_name) in ('ID', 'Class'):
            continue
        min_value = df[feature_name].min()
        value_range = df[feature_name].max() - min_value
        if value_range == 0:
            result1[feature_name] = 0.0
        else:
            result1[feature_name] = (df[feature_name] - min_value) / value_range
    return result1


result = normalize(byte_features_with_size)

# In[10]:
result.head(2)

# In[11]:
data_y = result['Class']
result.head()

# In[12]:
# Perplexity is capped so it stays below the number of samples.
xtsne = TSNE(perplexity=min(50, len(result) - 1))
def _scatter_by_class(x_coords, y_coords, class_labels):
    """Scatter a 2-D embedding, coloured by class with a 9-colour 'jet' map."""
    # plt.get_cmap is used because plt.cm.get_cmap was removed from newer
    # matplotlib releases.
    plt.scatter(x_coords, y_coords, c=class_labels, cmap=plt.get_cmap("jet", 9))
    plt.colorbar(ticks=range(10))
    plt.clim(0.5, 9)
    plt.show()


# t-SNE with the embedding object created at the end of the previous cell.
tsne_features = result.drop(['ID', 'Class'], axis=1)
results = xtsne.fit_transform(tsne_features)
vis_x = results[:, 0]
vis_y = results[:, 1]
_scatter_by_class(vis_x, vis_y, data_y)

# In[13]:
#this is with perplexity 30
# Capped like the first run: t-SNE needs perplexity < number of samples.
xtsne = TSNE(perplexity=min(30, len(result) - 1))
results = xtsne.fit_transform(tsne_features)
vis_x = results[:, 0]
vis_y = results[:, 1]
_scatter_by_class(vis_x, vis_y, data_y)

# In[14]:
#this is with perplexity 10
xtsne = TSNE(perplexity=min(10, len(result) - 1))
results = xtsne.fit_transform(tsne_features)
vis_x = results[:, 0]
vis_y = results[:, 1]
_scatter_by_class(vis_x, vis_y, data_y)

# In[15]:
data_y = result['Class']

# In[16]:
# Stratified 80/20 split into train/test, then the train part is split 80/20
# again into train/cross-validation.
X_train, X_test, y_train, y_test = train_test_split(
    result.drop(['ID', 'Class'], axis=1), data_y, stratify=data_y, test_size=0.20)
X_train, X_cv, y_train, y_cv = train_test_split(
    X_train, y_train, stratify=y_train, test_size=0.20)

# In[17]:
print('Number of data points in train data:', X_train.shape[0])
print('Number of data points in test data:', X_test.shape[0])
print('Number of data points in cross validation data:', X_cv.shape[0])

# In[18]:
train_class_distribution = y_train.value_counts().sort_values()
test_class_distribution = y_test.value_counts().sort_values()
cv_class_distribution = y_cv.value_counts().sort_values()

my_colors = ['r', 'g', 'b', 'k', 'y', 'm', 'c']


def _report_class_distribution(distribution, n_points, title):
    """Bar-plot a per-class count Series and print the counts, largest first.

    Args:
        distribution: Series mapping class label -> number of points.
        n_points: size of the split, used as the percentage denominator.
        title: title of the bar chart.
    """
    distribution.plot(kind='bar', color=my_colors)
    plt.xlabel('Class')
    plt.ylabel('Data points per Class')
    plt.title(title)
    plt.grid()
    plt.show()

    # ref: argsort https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html
    # Negating the counts makes argsort return positions in decreasing order.
    # The class is read from the Series index: the Series is sorted by count,
    # so "position + 1" is not the class label.
    for position in np.argsort(-distribution.values):
        count = distribution.values[position]
        print('Number of data points in class', distribution.index[position], ':', count,
              '(', np.round((count / n_points * 100), 3), '%)')


_report_class_distribution(train_class_distribution, y_train.shape[0],
                           'Distribution of yi in train data')
print('-' * 80)
_report_class_distribution(test_class_distribution, y_test.shape[0],
                           'Distribution of yi in test data')
print('-' * 80)
# The CV report is ordered by the CV counts themselves (it used to borrow the
# ordering of the train distribution).
_report_class_distribution(cv_class_distribution, y_cv.shape[0],
                           'Distribution of yi in cross validation data')


# In[19]:
def _divide_or_zero(counts, totals):
    """Return counts / totals element-wise, with 0.0 wherever a total is 0."""
    return np.divide(counts, totals,
                     out=np.zeros(counts.shape, dtype=float),
                     where=totals != 0)


def _show_heatmap(matrix, labels, cmap):
    """Draw one annotated heatmap with predicted classes on the x axis."""
    plt.figure(figsize=(10, 5))
    sns.heatmap(matrix, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()


def plot_confusion_matrix(test_y, predict_y, labels=None):
    """Print the misclassification rate and plot three heatmaps.

    The heatmaps are the confusion matrix C (C[i, j] = number of points of
    class i predicted as class j), the precision matrix (C divided by its
    column sums, so every column sums to 1) and the recall matrix (C divided
    by its row sums, so every row sums to 1).

    Example with C = [[1, 2], [3, 4]]:
        recall    = [[1/3, 2/3], [3/7, 4/7]]   (row sums are 3 and 7)
        precision = [[1/4, 2/6], [3/4, 4/6]]   (column sums are 4 and 6)

    Args:
        test_y: true class labels.
        predict_y: predicted class labels, one per entry of ``test_y``.
        labels: class labels fixing the row/column order of the matrices.
            Defaults to 1..9.  Passing them to ``confusion_matrix`` keeps the
            matrix aligned with the tick labels even when a class is absent.
    """
    if labels is None:
        labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]

    C = confusion_matrix(test_y, predict_y, labels=labels)
    # The value printed is a percentage, not a count.
    print("Percentage of misclassified points ", (len(test_y) - np.trace(C)) / len(test_y) * 100)

    # Recall: each row divided by its row sum.  Precision: each column divided
    # by its column sum.  A class with no points gets zeros instead of 0/0.
    A = _divide_or_zero(C, C.sum(axis=1, keepdims=True))
    B = _divide_or_zero(C, C.sum(axis=0, keepdims=True))

    cmap = sns.light_palette("green")

    print("-" * 50, "Confusion matrix", "-" * 50)
    _show_heatmap(C, labels, cmap)

    print("-" * 50, "Precision matrix", "-" * 50)
    _show_heatmap(B, labels, cmap)
    print("Sum of columns in precision matrix", B.sum(axis=0))

    print("-" * 50, "Recall matrix", "-" * 50)
    _show_heatmap(A, labels, cmap)
    print("Sum of rows in recall matrix", A.sum(axis=1))


# In[21]:
# we need to generate 9 numbers and the sum of numbers should be 1
# one solution is to genarate 9 numbers and divide each of the numbers by their sum
# ref: https://stackoverflow.com/a/18662466/4084039
# test_data_len = X_test.shape[0]
# cv_data_len = X_cv.shape[0]
# # we create a output array that has exactly same size as the CV data
# cv_predicted_y = np.zeros((cv_data_len,9))
# for i in range(cv_data_len):
#     rand_probs = np.random.rand(1,9)
#     cv_predicted_y[i] = ((rand_probs/sum(sum(rand_probs)))[0])
# print("Log loss on Cross Validation Data using Random
# Model",log_loss(y_cv,cv_predicted_y, eps=1e-15))
# # Test-Set error.
# #we create a output array that has exactly same as the test data
# test_predicted_y = np.zeros((test_data_len,9))
# for i in range(test_data_len):
#     rand_probs = np.random.rand(1,9)
#     test_predicted_y[i] = ((rand_probs/sum(sum(rand_probs)))[0])
# print("Log loss on Test Data using Random Model",log_loss(y_test,test_predicted_y, eps=1e-15))
# predicted_y =np.argmax(test_predicted_y, axis=1)
# plot_confusion_matrix(y_test, predicted_y+1)

import numpy as np
from sklearn.metrics import log_loss, confusion_matrix
import matplotlib.pyplot as plt

# Baseline "random model": every sample gets 9 random class probabilities.
# Column j of a prediction array stands for class j + 1.
random_model_labels = np.arange(1, 10)


def _random_probabilities(n_rows, n_classes=9):
    """Return an (n_rows, n_classes) array of random rows that each sum to 1."""
    rand_probs = np.random.rand(n_rows, n_classes)
    return rand_probs / rand_probs.sum(axis=1, keepdims=True)


test_data_len = X_test.shape[0]
cv_data_len = X_cv.shape[0]

# `labels` is passed explicitly so the score is still defined when a split
# happens to contain fewer than 9 distinct classes.
cv_predicted_y = _random_probabilities(cv_data_len)
print("Log loss on Cross Validation Data using Random Model",
      log_loss(y_cv, cv_predicted_y, labels=random_model_labels))

# Test-Set error
test_predicted_y = _random_probabilities(test_data_len)
print("Log loss on Test Data using Random Model",
      log_loss(y_test, test_predicted_y, labels=random_model_labels))

# Plot confusion matrix; argmax yields 0-based columns, hence the + 1.
predicted_y = np.argmax(test_predicted_y, axis=1)
conf_matrix = confusion_matrix(y_test, predicted_y + 1, labels=random_model_labels)
plt.matshow(conf_matrix, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# In[23]:
# find more about KNeighborsClassifier() here http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
# -------------------------
# default parameter
# KNeighborsClassifier(n_neighbors=5, weights=’uniform’, algorithm=’auto’, leaf_size=30, p=2,
# metric=’minkowski’, metric_params=None, n_jobs=1, **kwargs)
# methods of
# fit(X, y) : Fit the model using X as training data and y as target values
# predict(X):Predict the class labels for the provided data
# predict_proba(X):Return probability estimates for the test data X.
# find more about CalibratedClassifierCV here at http://scikit-learn.org/stable/modules/generated/sklearn.calibration.CalibratedClassifierCV.html
# ----------------------------
# default parameters
# sklearn.calibration.CalibratedClassifierCV(base_estimator=None, method=’sigmoid’, cv=3)
#
# some of the methods of CalibratedClassifierCV()
# fit(X, y[, sample_weight]) Fit the calibrated model
# get_params([deep]) Get parameters for this estimator.
# predict(X) Predict the target of new samples.
# predict_proba(X) Posterior probabilities of classification

import os
import pickle

import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss, confusion_matrix
import matplotlib.pyplot as plt

# Candidate neighbour counts: 1, 3, ..., 13.
alpha = list(range(1, 15, 2))
cv_log_error_array = []
for i in alpha:
    # With its default cv, CalibratedClassifierCV clones the estimator and
    # fits the clones itself, so the candidate is not fitted separately here.
    k_cfl = KNeighborsClassifier(n_neighbors=i)
    sig_clf = CalibratedClassifierCV(k_cfl, method="sigmoid")
    sig_clf.fit(X_train, y_train)
    predict_y = sig_clf.predict_proba(X_cv)
    # log_loss is called without the legacy `eps` argument.
    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=sig_clf.classes_))

for k_value, cv_error in zip(alpha, cv_log_error_array):
    print('log_loss for k = ', k_value, 'is', cv_error)

# Position (within `alpha`) of the candidate with the lowest CV log loss.
best_alpha = np.argmin(cv_log_error_array)

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for k_value, cv_error in zip(alpha, cv_log_error_array):
    # Explicit text instead of a tuple, whose repr depends on the numpy version.
    ax.annotate(f"({k_value}, {cv_error:.3f})", (k_value, cv_error))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# Refit with the selected k.  The plain estimator is fitted as well so that
# `k_cfl` is left in a fitted state after this cell, as before.
k_cfl = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
k_cfl.fit(X_train, y_train)
sig_clf = CalibratedClassifierCV(k_cfl, method="sigmoid")
sig_clf.fit(X_train, y_train)

# `labels` keeps the probability columns aligned with the classes even when a
# split lacks one of them (same call form as in the search loop).
predict_y = sig_clf.predict_proba(X_train)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",
      log_loss(y_train, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_cv)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",
      log_loss(y_cv, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_test)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",
      log_loss(y_test, predict_y, labels=sig_clf.classes_))

# Plot confusion matrix
conf_matrix = confusion_matrix(y_test, sig_clf.predict(X_test))
plt.matshow(conf_matrix, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Create the output directory first so a missing 'models/' folder cannot
# throw away the trained model at the very last step.
os.makedirs('models', exist_ok=True)
with open('models/KNeighborsClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)

# In[1]:
# read more about SGDClassifier() at http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
# ------------------------------
# default parameters
# SGDClassifier(loss=’hinge’, penalty=’l2’, alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
# shuffle=True, verbose=0, epsilon=0.1, n_jobs=1, random_state=None, learning_rate=’optimal’, eta0=0.0, power_t=0.5,
# class_weight=None, warm_start=False, average=False, n_iter=None)
# some of methods
# fit(X, y[, coef_init, intercept_init, …]) Fit linear model with Stochastic Gradient Descent.
# predict(X) Predict class labels for samples in X.
import os
import pickle

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss, confusion_matrix
import matplotlib.pyplot as plt

# Candidate values for C, the inverse regularisation strength: 1e-5 ... 1e3.
alpha = [10 ** x for x in range(-5, 4)]
cv_log_error_array = []
for i in alpha:
    # With its default cv, CalibratedClassifierCV clones the estimator and
    # fits the clones itself, so the candidate is not fitted separately here.
    logisticR = LogisticRegression(penalty='l2', C=i, class_weight='balanced')
    sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
    sig_clf.fit(X_train, y_train)
    predict_y = sig_clf.predict_proba(X_cv)
    # log_loss is called without the legacy `eps` argument.
    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=sig_clf.classes_))

for c_value, cv_error in zip(alpha, cv_log_error_array):
    print('log_loss for c = ', c_value, 'is', cv_error)

# Position (within `alpha`) of the candidate with the lowest CV log loss.
best_alpha = np.argmin(cv_log_error_array)

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for c_value, cv_error in zip(alpha, cv_log_error_array):
    # Explicit text instead of a tuple, whose repr depends on the numpy version.
    ax.annotate(f"({c_value}, {cv_error:.3f})", (c_value, cv_error))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# Refit with the selected C.  The plain estimator is fitted as well so that
# `logisticR` is left in a fitted state after this cell, as before.
logisticR = LogisticRegression(penalty='l2', C=alpha[best_alpha], class_weight='balanced')
logisticR.fit(X_train, y_train)
sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
sig_clf.fit(X_train, y_train)

predict_y = sig_clf.predict_proba(X_train)
print('log loss for train data', log_loss(y_train, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_cv)
print('log loss for cv data', log_loss(y_cv, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_test)
print('log loss for test data', log_loss(y_test, predict_y, labels=sig_clf.classes_))

# Plot confusion matrix
conf_matrix = confusion_matrix(y_test, sig_clf.predict(X_test))
plt.matshow(conf_matrix, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Create the output directory first so a missing 'models/' folder cannot
# throw away the trained model at the very last step.
os.makedirs('models', exist_ok=True)
# NOTE(review): the file is named after SGDClassifier although the pickled
# model wraps LogisticRegression; the name is kept because other code may
# load the model by this path.
with open('models/SGDClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)

# In[2]:
# # --------------------------------
# # default parameters
# # sklearn.ensemble.RandomForestClassifier(n_estimators=10, criterion=’gini’, max_depth=None, min_samples_split=2,
# # min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=’auto’,
# # max_leaf_nodes=None, min_impurity_decrease=0.0,
# # min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False,
# # class_weight=None)
# # Some of methods of RandomForestClassifier()
# # fit(X, y, [sample_weight]) Fit the forest according to the given training data.
# # predict(X) Perform classification on samples in X.
# # predict_proba (X) Return class probability estimates for samples in X.
# # some of attributes of RandomForestClassifier()
# # feature_importances_ : array of shape = [n_features]
# # The feature importances (the higher, the more important the feature).

import os
import pickle

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss, confusion_matrix
import matplotlib.pyplot as plt

# Candidate forest sizes (number of trees).
alpha = [10, 50, 100, 500, 1000, 2000, 3000]
cv_log_error_array = []
for i in alpha:
    # With its default cv, CalibratedClassifierCV clones the estimator and
    # fits the clones itself, so the candidate forest is not fitted separately
    # here (that extra fit was pure overhead, most of all for the large forests).
    r_cfl = RandomForestClassifier(n_estimators=i, random_state=42, n_jobs=-1)
    sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
    sig_clf.fit(X_train, y_train)
    predict_y = sig_clf.predict_proba(X_cv)
    # log_loss is called without the legacy `eps` argument.
    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=sig_clf.classes_))

for n_trees, cv_error in zip(alpha, cv_log_error_array):
    print('log_loss for c = ', n_trees, 'is', cv_error)

# Position (within `alpha`) of the candidate with the lowest CV log loss.
best_alpha = np.argmin(cv_log_error_array)

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for n_trees, cv_error in zip(alpha, cv_log_error_array):
    # Explicit text instead of a tuple, whose repr depends on the numpy version.
    ax.annotate(f"({n_trees}, {cv_error:.3f})", (n_trees, cv_error))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# Refit with the selected size.  The plain forest is fitted as well so that
# `r_cfl` (e.g. its feature_importances_) stays usable after this cell.
r_cfl = RandomForestClassifier(n_estimators=alpha[best_alpha], random_state=42, n_jobs=-1)
r_cfl.fit(X_train, y_train)
sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
sig_clf.fit(X_train, y_train)

# `labels` keeps the probability columns aligned with the classes even when a
# split lacks one of them (same call form as in the search loop).
predict_y = sig_clf.predict_proba(X_train)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",
      log_loss(y_train, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_cv)
print('For values of best alpha = ', alpha[best_alpha], "The cross-validation log loss is:",
      log_loss(y_cv, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_test)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",
      log_loss(y_test, predict_y, labels=sig_clf.classes_))

# Plot confusion matrix
conf_matrix = confusion_matrix(y_test, sig_clf.predict(X_test))
plt.matshow(conf_matrix, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Create the output directory first so a missing 'models/' folder cannot
# throw away the trained model at the very last step.
os.makedirs('models', exist_ok=True)
with open('models/RandomForestClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)

# In[3]:
# # Training a hyper-parameter tuned Xg-Boost regressor on our train data
# # find more about XGBClassifier function here http://xgboost.readthedocs.io/en/latest/python/python_api.html?#xgboost.XGBClassifier
# # -------------------------
# # default parameters
# # class xgboost.XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True,
# # objective='binary:logistic', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1,
# # max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
# # scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, **kwargs)
# # some of methods of RandomForestRegressor()
# # fit(X, y, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None)
# # get_params([deep]) Get parameters for this estimator.
# # predict(data, output_margin=False, ntree_limit=0) : Predict with data. NOTE: This function is not thread safe.
# # get_score(importance_type='weight') -> get the feature importance

import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix, log_loss
from xgboost import XGBClassifier

# Tune the number of boosting rounds of a sigmoid-calibrated XGBoost
# classifier on the CV split, report log loss on train / CV / test, plot the
# test confusion matrix and pickle the selected calibrated model.

# Shift the class labels down by one so they start from 0 (recent XGBoost
# releases reject label sets that do not start at 0).
# NOTE: the pickled model therefore predicts the *shifted* labels; add 1 to
# its predictions to get back the original class ids.
y_train_adjusted = y_train - 1
y_cv_adjusted = y_cv - 1
y_test_adjusted = y_test - 1

# Candidate values for n_estimators (kept under the historical name `alpha`).
alpha = [10, 50, 100, 500, 1000, 2000]
cv_log_error_array = []
best_cv_loss = np.inf
best_model = None
for n_estimators in alpha:
    # CalibratedClassifierCV (default cv) clones the estimator and fits the
    # clones on its own folds, so fitting the base estimator beforehand would
    # only duplicate the training cost.
    x_cfl = XGBClassifier(n_estimators=n_estimators, n_jobs=-1)
    candidate = CalibratedClassifierCV(x_cfl, method="sigmoid")
    candidate.fit(X_train, y_train_adjusted)
    cv_loss = log_loss(
        y_cv_adjusted, candidate.predict_proba(X_cv), labels=candidate.classes_
    )
    cv_log_error_array.append(cv_loss)
    # Strict '<' keeps the first minimum, matching np.argmin below.
    if cv_loss < best_cv_loss:
        best_cv_loss = cv_loss
        best_model = (x_cfl, candidate)

for n_estimators, cv_loss in zip(alpha, cv_log_error_array):
    print('log_loss for c = ', n_estimators, 'is', cv_loss)

best_alpha = np.argmin(cv_log_error_array)

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for n_estimators, cv_loss in zip(alpha, cv_log_error_array):
    # annotate() expects text, so format the point label explicitly.
    ax.annotate(f"({n_estimators}, {cv_loss:.3f})", (n_estimators, cv_loss))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# Reuse the calibrated model already fitted for the best n_estimators instead
# of training the same configuration a second time.
x_cfl, sig_clf = best_model

# Passing labels= keeps the metric well defined even if a split happens to be
# missing one of the classes seen during training.
predict_y = sig_clf.predict_proba(X_train)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",
      log_loss(y_train_adjusted, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_cv)
print('For values of best alpha = ', alpha[best_alpha], "The cross-validation log loss is:",
      log_loss(y_cv_adjusted, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_test)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",
      log_loss(y_test_adjusted, predict_y, labels=sig_clf.classes_))

# Plot confusion matrix
conf_matrix = confusion_matrix(
    y_test_adjusted, sig_clf.predict(X_test), labels=sig_clf.classes_
)
plt.matshow(conf_matrix, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
# Label the axes with the original class ids (shifted label + 1).
tick_positions = np.arange(len(sig_clf.classes_))
plt.xticks(tick_positions, sig_clf.classes_ + 1)
plt.yticks(tick_positions, sig_clf.classes_ + 1)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Make sure the output directory exists so a long training run is not lost
# to a FileNotFoundError at save time.
os.makedirs('models', exist_ok=True)
with open('models/XGBClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)


# In[4]:


# # https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

# Randomized hyper-parameter search for an XGBoost classifier on the
# training split.

# Shift the class labels down by one so they start from 0 (recent XGBoost
# releases reject label sets that do not start at 0).
y_train_adjusted = y_train - 1
y_cv_adjusted = y_cv - 1
y_test_adjusted = y_test - 1

x_cfl = XGBClassifier()
params = {
    'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.15, 0.2],
    'n_estimators': [100, 200, 500, 1000, 2000],
    'max_depth': [3, 5, 10],
    'colsample_bytree': [0.1, 0.3, 0.5, 1],
    'subsample': [0.1, 0.3, 0.5, 1],
}
random_cfl1 = RandomizedSearchCV(
    x_cfl,
    param_distributions=params,
    # Select by log loss, the metric used to evaluate every model in this
    # file, rather than the classifier's default accuracy score.
    scoring="neg_log_loss",
    # Fixed seed so the sampled parameter combinations are reproducible.
    random_state=42,
    verbose=10,
    n_jobs=-1,
)
random_cfl1.fit(X_train, y_train_adjusted)

print(f"Best Parameters: {random_cfl1.best_params_}")
# best_score_ is the mean cross-validated *negative* log loss (closer to 0 is
# better).
print(f"Best Score: {random_cfl1.best_score_}")