1110 lines
38 KiB
Python
1110 lines
38 KiB
Python
#!/usr/bin/env python
|
||
# coding: utf-8
|
||
|
||
# In[1]:
|
||
|
||
|
||
import warnings
|
||
warnings.filterwarnings("ignore")
|
||
|
||
import shutil
|
||
import os
|
||
import pandas as pd
|
||
|
||
import matplotlib
|
||
matplotlib.use('nbAgg')
|
||
import matplotlib.pyplot as plt
|
||
import seaborn as sns
|
||
|
||
import numpy as np
|
||
from tqdm import tqdm
|
||
|
||
import pickle
|
||
|
||
from sklearn.manifold import TSNE
|
||
from sklearn import preprocessing
|
||
from sklearn.model_selection import RandomizedSearchCV, train_test_split
|
||
from sklearn.tree import DecisionTreeClassifier
|
||
from sklearn.calibration import CalibratedClassifierCV
|
||
from sklearn.neighbors import KNeighborsClassifier
|
||
from sklearn.metrics import log_loss, confusion_matrix
|
||
from sklearn.linear_model import LogisticRegression
|
||
from sklearn.ensemble import RandomForestClassifier
|
||
from sklearn.feature_selection import SelectKBest, chi2, f_regression
|
||
|
||
import re
|
||
from nltk.util import ngrams
|
||
|
||
import scipy.sparse
|
||
import gc
|
||
import pickle as pkl
|
||
from datetime import datetime as dt
|
||
import dask.dataframe as dd
|
||
|
||
import tornado
|
||
import tqdm
|
||
import xgboost
|
||
import nltk
|
||
import dask
|
||
|
||
|
||
# In[2]:
|
||
|
||
|
||
#imported all the bytes files
|
||
# root_path = '/Users/yasha/Desktop/Li-phy/Bytes/fuso63yW7wAXO20gJIBS.bytes'
|
||
# root_path = '/Users/yasha/Desktop/Li-phy/Bytes/'
|
||
# # #get_sha256_hash(root_path)
|
||
# # file_open = open(root_path,"rb")
|
||
# # print(file_open.read())
|
||
# # hex_representation = ' '.join(f'{byte:02X}' for byte in file_open.read())
|
||
# # print(hex_representation)
|
||
# # file_open.close()
|
||
# #file_open.close()
|
||
# if os.path.isdir(root_path):
|
||
# data_files = os.listdir(root_path)
|
||
# print(root_path)
|
||
# for file in data_files:
|
||
# #print(file)
|
||
# path_now=root_path+file
|
||
# file_open = open(path_now,"rb")
|
||
# #print(file_open.read())
|
||
# # hex_representation = ' '.join(f'{byte:02X}' for byte in file_open.read())
|
||
# # print(hex_representation)
|
||
# # file_open.close()
|
||
|
||
# From the loaded label file, analyze what percentage of the malware samples belongs to each class.
|
||
#data_classification = pd.read_csv('/Users/yasha/Desktop/Li-phy/trainLabels.csv')
|
||
# Load the training labels; the 'Class' column drives the count plot below.
Y = pd.read_csv("trainLabels.csv")
total = float(len(Y))

# One bar per class, each annotated with its share of all labelled rows.
ax = sns.countplot(x="Class", data=Y)
for bar in ax.patches:
    bar_height = bar.get_height()
    share_text = '{:.1f}%'.format(100 * bar_height / total)
    ax.annotate(share_text, (bar.get_x() + 0.1, bar_height + 5))

# Put 11 ticks (therefore 10 steps) from 0 to the total number of rows.
ax.yaxis.set_ticks(np.linspace(0, total, 11))

# Relabel the ticks as percentages without changing their positions.
tick_percentages = 100 * ax.yaxis.get_majorticklocs() / total
ax.set_yticklabels(['{:.1f}%'.format(value) for value in tick_percentages])
plt.show()
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
# In[3]:
|
||
|
||
|
||
# For each labelled sample, compute the size of its byte file so sizes can be compared per class.
|
||
|
||
# data_files = os.listdir(root_path)
|
||
# for file in data_files:
|
||
# path_now=root_path+file
|
||
# files=open(path_now,"rb")
|
||
|
||
# files=os.listdir("/Users/yasha/Desktop/malware/Bytes/")
|
||
# filenames=Y['Id'].tolist()
|
||
# class_y=Y['Class'].tolist()
|
||
# class_bytes=[]
|
||
# sizebytes=[]
|
||
# fnames=[]
|
||
# for file in files:
|
||
# # print(os.stat('byteFiles/0A32eTdBKayjCWhZqDOQ.txt'))
|
||
# # os.stat_result(st_mode=33206, st_ino=1125899906874507, st_dev=3561571700, st_nlink=1, st_uid=0, st_gid=0,
|
||
# # st_size=3680109, st_atime=1519638522, st_mtime=1519638522, st_ctime=1519638522)
|
||
# # read more about os.stat: here https://www.tutorialspoint.com/python/os_stat.htm
|
||
# statinfo=os.stat("/Users/yasha/Desktop/malware/BYTES/"+file)
|
||
# #print(statinfo)
|
||
# # split the file name at '.' and take the first part of it i.e the file name
|
||
# file=file.split('.')[0]
|
||
|
||
# if any(file == filename for filename in filenames):
|
||
# i=filenames.index(file)
|
||
# print(i)
|
||
# class_bytes.append(class_y[i])
|
||
# # converting into Mb's
|
||
# sizebytes.append(statinfo.st_size/(1024.0*1024.0))
|
||
# fnames.append(file)
|
||
# data_size_byte=pd.DataFrame({'ID':fnames,'size':sizebytes,'Class':class_bytes})
|
||
# print (data_size_byte)
|
||
|
||
|
||
import os
|
||
import pandas as pd
|
||
|
||
# Assuming Y is a DataFrame that you already have
|
||
# Y = pd.read_csv('your_file.csv') # Example, if you need to load Y
|
||
|
||
# Build data_size_byte: one row per file in BYTES-train whose name (without
# extension) appears in Y['Id'], holding the file size in MB and its class.
files = os.listdir("BYTES-train")
filenames = Y['Id'].tolist()
class_y = Y['Class'].tolist()
class_bytes = []
sizebytes = []
fnames = []

# Map every Id to the position of its first occurrence once, so each file is
# matched with a dict lookup instead of a linear scan of the whole Id list.
first_index_by_id = {}
for position, sample_id in enumerate(filenames):
    first_index_by_id.setdefault(sample_id, position)

for file in files:
    file_name_no_ext = file.split('.')[0]
    file_path = os.path.join("BYTES-train", file)

    try:
        statinfo = os.stat(file_path)
    except FileNotFoundError:
        # The entry disappeared between listdir() and stat(); skip it.
        print(f"File not found: {file_path}")
        continue

    i = first_index_by_id.get(file_name_no_ext)
    if i is None:
        print(f"No match found for file: {file_name_no_ext} ")
        continue

    print(f"File matched: {file_name_no_ext} at index {i}")
    class_bytes.append(class_y[i])
    sizebytes.append(statinfo.st_size / (1024.0 * 1024.0))  # bytes -> MB
    fnames.append(file_name_no_ext)

data_size_byte = pd.DataFrame({'ID': fnames, 'size': sizebytes, 'Class': class_bytes})
|
||
|
||
|
||
|
||
|
||
# In[4]:
|
||
|
||
|
||
#box plot of file size (.bytes file)
|
||
# Box plot of file size in MB (the 'size' column), one box per class.
ax = sns.boxplot(data=data_size_byte, x="Class", y="size")
ax.set_title("boxplot of .bytes file sizes")
plt.show()
|
||
|
||
|
||
# In[5]:
|
||
|
||
|
||
#Extracting unigram of byte files
|
||
# Extract byte-unigram features from the samples in BYTES-train.
#
# Step 1 rewrites every "<id>.bytes" file as "<id>.txt" with the first
# space-separated token of each line removed (presumably the address column
# of the hex dump -- confirm against the data), then deletes the original.
# Step 2 counts, for every .txt file, how often each byte value 00..ff and
# the '??' placeholder occur, and writes one CSV row per file to result.csv.
bytes_dir = "BYTES-train"

files = os.listdir(bytes_dir)
filenames = []  # not used in this cell; kept in case a later cell reads it
array = []      # not used in this cell; kept in case a later cell reads it
for file in files:
    if not file.endswith("bytes"):
        continue
    file = file.split('.')[0]
    bytes_path = os.path.join(bytes_dir, file + ".bytes")
    txt_path = os.path.join(bytes_dir, file + ".txt")
    with open(bytes_path, "r") as fp, open(txt_path, "w") as text_file:
        for line in fp:
            text_file.write(' '.join(line.rstrip().split(" ")[1:]) + "\n")
    # Delete the source only after both handles are closed; removing a file
    # that is still open fails on some platforms.
    os.remove(bytes_path)

# Only the converted .txt dumps are samples. Any other directory entry would
# otherwise get an ID written with no counts and corrupt the CSV.
files = [name for name in os.listdir(bytes_dir) if name.endswith("txt")]
filenames2 = []
feature_matrix = np.zeros((len(files), 257), dtype=int)
k = 0

# Header columns: byte values 0x00-0x09 as single digits, 0x0a-0xff as two
# lowercase hex digits, then '??' (identical to the former literal header).
byte_columns = [str(value) if value < 10 else f"{value:02x}" for value in range(256)]
header = ",".join(["ID"] + byte_columns + ["??"])

with open('result.csv', 'w') as byte_feature_file:
    byte_feature_file.write(header + "\n")
    for file in files:
        filenames2.append(file)
        with open(os.path.join(bytes_dir, file), "r") as byte_file:
            for text_line in byte_file:
                # split() without arguments yields no empty tokens, so blank
                # lines no longer trigger a parse error.
                for hex_code in text_line.split():
                    if hex_code == '??':
                        feature_matrix[k][256] += 1
                        continue
                    try:
                        byte_value = int(hex_code, 16)
                    except ValueError as e:
                        print(f"Error Occured @ {file} - {e}")
                        continue
                    # Guard the index: a token such as '100' or '-1' would
                    # otherwise be counted in the '??' column.
                    if 0 <= byte_value <= 255:
                        feature_matrix[k][byte_value] += 1
                    else:
                        print(f"Error Occured @ {file} - byte value out of range: {hex_code}")
        row = ",".join(str(count) for count in feature_matrix[k])
        # The ID keeps its file extension here; the loading cell strips it.
        byte_feature_file.write(file + "," + row + "\n")
        k += 1
|
||
|
||
|
||
|
||
# # Paths
|
||
# input_dir = "/Users/yasha/Desktop/malware/BYTES-train"
|
||
# output_csv = '/Users/yasha/Desktop/malware/BYTES-train/result.csv'
|
||
|
||
# # Step 1: Extracting unigrams of byte files
|
||
# files = [file for file in os.listdir(input_dir) if file.endswith(".bytes")]
|
||
|
||
# for file in files:
|
||
# base_name = file.split('.')[0]
|
||
# bytes_path = os.path.join(input_dir, file)
|
||
# txt_path = os.path.join(input_dir, f"{base_name}.txt")
|
||
|
||
# with open(bytes_path, 'r') as fp, open(txt_path, 'w') as text_file:
|
||
# for line in fp:
|
||
# a = line.rstrip().split(" ")[1:]
|
||
# text_file.write(' '.join(a) + "\n")
|
||
|
||
# os.remove(bytes_path) # Remove the original .bytes file
|
||
|
||
# # Step 2: Compute feature matrix
|
||
# files_txt = [file for file in os.listdir(input_dir) if file.endswith(".txt")]
|
||
# num_files = len(files_txt)
|
||
# num_features = 257
|
||
# feature_matrix = np.zeros((num_files, num_features), dtype=int)
|
||
|
||
# # Writing CSV header
|
||
# header = ",".join(f"{i:02x}" for i in range(num_features)) + ",??"
|
||
# with open(output_csv, 'w') as byte_feature_file:
|
||
# byte_feature_file.write(f"ID,{header}\n")
|
||
|
||
# for k, file in enumerate(files_txt):
|
||
# base_name = file.split('.')[0]
|
||
# file_path = os.path.join(input_dir, file)
|
||
|
||
# with open(file_path, 'r') as byte_file:
|
||
# for line in byte_file:
|
||
# line = line.rstrip().split(" ")
|
||
# for hex_code in line:
|
||
# if hex_code == '??':
|
||
# feature_matrix[k, 256] += 1
|
||
# else:
|
||
# feature_matrix[k, int(hex_code, 16)] += 1
|
||
|
||
# row = ",".join(map(str, feature_matrix[k])) # Convert row to CSV format
|
||
# byte_feature_file.write(f"{base_name},{row}\n")
|
||
|
||
|
||
|
||
|
||
|
||
|
||
# In[6]:
|
||
|
||
|
||
# Load the unigram counts. IDs were written with their file extension, so
# keep only the part before the first '.' to line them up with the size table.
byte_features = pd.read_csv("result.csv")
print("Original id: ", byte_features['ID'][0])
id_stems = byte_features['ID'].str.split('.').str[0]
byte_features['ID'] = id_stems
print("byte_Feature: ", byte_features.head(2))


# In[7]:


print("byte_size: ", data_size_byte.head(2))


# In[8]:


# Join counts with size/class on ID (default inner join) and persist the result.
byte_features_with_size = pd.merge(byte_features, data_size_byte, on='ID')
byte_features_with_size.to_csv("result_with_size.csv")
print("Combined: ", byte_features_with_size.head(2))
|
||
|
||
|
||
# In[9]:
|
||
|
||
|
||
def normalize(df):
    """Min-max scale every feature column of ``df`` into the range [0, 1].

    Columns named 'ID' and 'Class' are copied through unchanged; every other
    column is replaced by ``(value - min) / (max - min)``. A column whose
    values are all equal has no range to scale by, so it is set to 0.0
    instead of dividing by zero and filling the column with NaN.

    Args:
        df: DataFrame whose columns other than 'ID' and 'Class' are numeric.

    Returns:
        A scaled copy of ``df``; the input frame is not modified.
    """
    result1 = df.copy()
    for feature_name in df.columns:
        if str(feature_name) in ('ID', 'Class'):
            continue
        column = df[feature_name]
        min_value = column.min()
        value_range = column.max() - min_value
        if value_range == 0:
            result1[feature_name] = 0.0
        else:
            result1[feature_name] = (column - min_value) / value_range
    return result1
|
||
|
||
result = normalize(byte_features_with_size)


# In[10]:


result.head(2)


# In[11]:


data_y = result['Class']
result.head()


# In[12]:


def _plot_tsne(features, labels, perplexity):
    """Embed ``features`` in 2-D with t-SNE and scatter-plot it coloured by label.

    The requested perplexity is capped at ``len(features) - 1`` so a small
    data set does not make TSNE reject the value; the first cell already
    applied this cap, it is now applied to every perplexity.

    Args:
        features: feature table without the 'ID' and 'Class' columns.
        labels: one class label per row, used as the point colour.
        perplexity: requested t-SNE perplexity.

    Returns:
        The array returned by ``TSNE.fit_transform``; columns 0 and 1 are
        the plotted x and y coordinates.
    """
    capped_perplexity = min(perplexity, len(features) - 1)
    embedding = TSNE(perplexity=capped_perplexity).fit_transform(features)
    # plt.get_cmap replaces plt.cm.get_cmap, which newer Matplotlib removed.
    plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap=plt.get_cmap("jet", 9))
    plt.colorbar(ticks=range(10))
    plt.clim(0.5, 9)
    plt.show()
    return embedding


tsne_features = result.drop(['ID', 'Class'], axis=1)

results = _plot_tsne(tsne_features, data_y, perplexity=50)
vis_x = results[:, 0]
vis_y = results[:, 1]


# In[13]:


#this is with perplexity 30
results = _plot_tsne(tsne_features, data_y, perplexity=30)
vis_x = results[:, 0]
vis_y = results[:, 1]


# In[14]:


#this is with perplexity 10
results = _plot_tsne(tsne_features, data_y, perplexity=10)
vis_x = results[:, 0]
vis_y = results[:, 1]
|
||
|
||
|
||
# In[15]:
|
||
|
||
|
||
data_y = result['Class']


# In[16]:


# Split into test (20% of all rows) and, from the remainder, cross-validation
# (20% of that remainder) and train; every split is stratified by class.
# A fixed seed makes the split, and therefore the tuning results and saved
# models below, repeatable across runs (42 matches the seed already used for
# RandomForestClassifier in this file).
split_seed = 42
model_features = result.drop(['ID', 'Class'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    model_features, data_y, stratify=data_y, test_size=0.20, random_state=split_seed)
X_train, X_cv, y_train, y_cv = train_test_split(
    X_train, y_train, stratify=y_train, test_size=0.20, random_state=split_seed)


# In[17]:


print('Number of data points in train data:', X_train.shape[0])
print('Number of data points in test data:', X_test.shape[0])
print('Number of data points in cross validation data:', X_cv.shape[0])
|
||
|
||
|
||
# In[18]:
|
||
|
||
|
||
def _report_class_distribution(labels, split_name, colors):
    """Bar-plot and print how many points of each class ``labels`` contains.

    Args:
        labels: pandas Series of class labels for one data split.
        split_name: text inserted into the plot title ("train", "test", ...).
        colors: bar colours passed to the pandas bar plot.

    Returns:
        The value counts of ``labels`` sorted by ascending count; the index
        holds the class labels.
    """
    distribution = labels.value_counts().sort_values()
    distribution.plot(kind='bar', color=colors)
    plt.xlabel('Class')
    plt.ylabel('Data points per Class')
    plt.title('Distribution of yi in ' + split_name + ' data')
    plt.grid()
    plt.show()

    # ref: argsort https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html
    # Negating the counts makes argsort walk the classes in decreasing order.
    # The class label is read from the index: the series is ordered by count,
    # so a position + 1 is not the class number.
    for position in np.argsort(-distribution.values):
        count = distribution.values[position]
        print('Number of data points in class', distribution.index[position], ':', count,
              '(', np.round((count / labels.shape[0] * 100), 3), '%)')
    return distribution


my_colors = ['r', 'g', 'b', 'k', 'y', 'm', 'c']

train_class_distribution = _report_class_distribution(y_train, 'train', my_colors)

print('-'*80)
test_class_distribution = _report_class_distribution(y_test, 'test', my_colors)

print('-'*80)
# Each split is ordered by its own counts (the CV report previously reused
# the ordering of the train split).
cv_class_distribution = _report_class_distribution(y_cv, 'cross validation', my_colors)
|
||
|
||
|
||
# In[19]:
|
||
|
||
|
||
def _draw_class_heatmap(matrix, labels, cmap):
    """Draw one annotated heatmap: predicted classes on x, original on y."""
    plt.figure(figsize=(10, 5))
    sns.heatmap(matrix, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()


def plot_confusion_matrix(test_y, predict_y, labels=None):
    """Plot the confusion, precision and recall matrices of a prediction.

    Three heatmaps are drawn in turn: the raw counts C, the column-normalised
    matrix B (precision) and the row-normalised matrix A (recall). Cell
    (i, j) refers to points of original class i that were predicted as
    class j. The misclassification percentage and the column/row sums of
    B and A are printed along the way.

    Args:
        test_y: true class labels.
        predict_y: predicted class labels, aligned with ``test_y``.
        labels: class labels fixing the order and size of the matrices, so
            the tick labels always match even when a class is missing from
            the data. Defaults to the nine classes 1..9.
    """
    if labels is None:
        labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]

    C = confusion_matrix(test_y, predict_y, labels=labels)
    # The printed value is a percentage, not a count.
    print("Percentage of misclassified points ", (len(test_y) - np.trace(C)) / len(test_y) * 100)

    # A: divide each element by the sum of its ROW, e.g.
    #   C = [[1, 2],          A = [[1/3, 2/3],
    #        [3, 4]]   ->          [3/7, 4/7]]     (every row sums to 1)
    # A class with no original points keeps an all-zero row instead of NaN.
    row_totals = C.sum(axis=1, keepdims=True)
    A = np.divide(C, row_totals, out=np.zeros(C.shape, dtype=float), where=row_totals != 0)

    # B: divide each element by the sum of its COLUMN, e.g.
    #   C = [[1, 2],          B = [[1/4, 2/6],
    #        [3, 4]]   ->          [3/4, 4/6]]     (every column sums to 1)
    # A class that is never predicted keeps an all-zero column instead of NaN.
    column_totals = C.sum(axis=0, keepdims=True)
    B = np.divide(C, column_totals, out=np.zeros(C.shape, dtype=float), where=column_totals != 0)

    cmap = sns.light_palette("green")

    # Raw counts.
    print("-"*50, "Confusion matrix", "-"*50)
    _draw_class_heatmap(C, labels, cmap)

    # Column-normalised counts.
    print("-"*50, "Precision matrix", "-"*50)
    _draw_class_heatmap(B, labels, cmap)
    print("Sum of columns in precision matrix", B.sum(axis=0))

    # Row-normalised counts.
    print("-"*50, "Recall matrix", "-"*50)
    _draw_class_heatmap(A, labels, cmap)
    print("Sum of rows in recall matrix", A.sum(axis=1))
|
||
|
||
|
||
# In[21]:
|
||
|
||
|
||
# we need to generate 9 numbers and the sum of numbers should be 1
|
||
# one solution is to genarate 9 numbers and divide each of the numbers by their sum
|
||
# ref: https://stackoverflow.com/a/18662466/4084039
|
||
|
||
# test_data_len = X_test.shape[0]
|
||
# cv_data_len = X_cv.shape[0]
|
||
|
||
# # we create a output array that has exactly same size as the CV data
|
||
# cv_predicted_y = np.zeros((cv_data_len,9))
|
||
# for i in range(cv_data_len):
|
||
# rand_probs = np.random.rand(1,9)
|
||
# cv_predicted_y[i] = ((rand_probs/sum(sum(rand_probs)))[0])
|
||
# print("Log loss on Cross Validation Data using Random Model",log_loss(y_cv,cv_predicted_y, eps=1e-15))
|
||
|
||
|
||
# # Test-Set error.
|
||
# #we create a output array that has exactly same as the test data
|
||
# test_predicted_y = np.zeros((test_data_len,9))
|
||
# for i in range(test_data_len):
|
||
# rand_probs = np.random.rand(1,9)
|
||
# test_predicted_y[i] = ((rand_probs/sum(sum(rand_probs)))[0])
|
||
# print("Log loss on Test Data using Random Model",log_loss(y_test,test_predicted_y, eps=1e-15))
|
||
|
||
# predicted_y =np.argmax(test_predicted_y, axis=1)
|
||
# plot_confusion_matrix(y_test, predicted_y+1)
|
||
|
||
|
||
import numpy as np
|
||
from sklearn.metrics import log_loss, confusion_matrix
|
||
import matplotlib.pyplot as plt
|
||
|
||
# Baseline: log loss and confusion matrix of a model that assigns random
# class probabilities, as a reference point for the real classifiers.
test_data_len = X_test.shape[0]
cv_data_len = X_cv.shape[0]

n_classes = 9
# Classes are numbered 1..9. Passing them explicitly lets log_loss line up
# the 9 probability columns even if a split happens to miss a class.
class_labels = np.arange(1, n_classes + 1)

# One row per CV point: 9 uniform random numbers scaled so the row sums to 1.
cv_random = np.random.rand(cv_data_len, n_classes)
cv_predicted_y = cv_random / cv_random.sum(axis=1, keepdims=True)

# Compute log loss on cross-validation data
print("Log loss on Cross Validation Data using Random Model",
      log_loss(y_cv, cv_predicted_y, labels=class_labels))

# Test-set error: same construction, sized like the test data.
test_random = np.random.rand(test_data_len, n_classes)
test_predicted_y = test_random / test_random.sum(axis=1, keepdims=True)

# Compute log loss on test data
print("Log loss on Test Data using Random Model",
      log_loss(y_test, test_predicted_y, labels=class_labels))

# Plot confusion matrix; argmax is 0-based, the class labels start at 1.
predicted_y = np.argmax(test_predicted_y, axis=1)
conf_matrix = confusion_matrix(y_test, predicted_y + 1)

plt.matshow(conf_matrix, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
|
||
|
||
|
||
# In[23]:
|
||
|
||
|
||
# find more about KNeighborsClassifier() here http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
|
||
# -------------------------
|
||
# default parameter
|
||
# KNeighborsClassifier(n_neighbors=5, weights=’uniform’, algorithm=’auto’, leaf_size=30, p=2,
|
||
# metric=’minkowski’, metric_params=None, n_jobs=1, **kwargs)
|
||
|
||
# methods of
|
||
# fit(X, y) : Fit the model using X as training data and y as target values
|
||
# predict(X):Predict the class labels for the provided data
|
||
# predict_proba(X):Return probability estimates for the test data X.
|
||
|
||
# find more about CalibratedClassifierCV here at http://scikit-learn.org/stable/modules/generated/sklearn.calibration.CalibratedClassifierCV.html
|
||
# ----------------------------
|
||
# default paramters
|
||
# sklearn.calibration.CalibratedClassifierCV(base_estimator=None, method=’sigmoid’, cv=3)
|
||
#
|
||
# some of the methods of CalibratedClassifierCV()
|
||
# fit(X, y[, sample_weight]) Fit the calibrated model
|
||
# get_params([deep]) Get parameters for this estimator.
|
||
# predict(X) Predict the target of new samples.
|
||
# predict_proba(X) Posterior probabilities of classification
|
||
|
||
|
||
# alpha = [x for x in range(1, 15, 2)]
|
||
# cv_log_error_array=[]
|
||
# for i in alpha:
|
||
# k_cfl=KNeighborsClassifier(n_neighbors=i)
|
||
# k_cfl.fit(X_train,y_train)
|
||
# sig_clf = CalibratedClassifierCV(k_cfl, method="sigmoid")
|
||
# sig_clf.fit(X_train, y_train)
|
||
# predict_y = sig_clf.predict_proba(X_cv)
|
||
# cv_log_error_array.append(log_loss(y_cv, predict_y, labels=k_cfl.classes_, eps=1e-15))
|
||
|
||
# for i in range(len(cv_log_error_array)):
|
||
# print ('log_loss for k = ',alpha[i],'is',cv_log_error_array[i])
|
||
|
||
# best_alpha = np.argmin(cv_log_error_array)
|
||
|
||
# fig, ax = plt.subplots()
|
||
# ax.plot(alpha, cv_log_error_array,c='g')
|
||
# for i, txt in enumerate(np.round(cv_log_error_array,3)):
|
||
# ax.annotate((alpha[i],np.round(txt,3)), (alpha[i],cv_log_error_array[i]))
|
||
# plt.grid()
|
||
# plt.title("Cross Validation Error for each alpha")
|
||
# plt.xlabel("Alpha i's")
|
||
# plt.ylabel("Error measure")
|
||
# plt.show()
|
||
|
||
# k_cfl=KNeighborsClassifier(n_neighbors=alpha[best_alpha])
|
||
# k_cfl.fit(X_train,y_train)
|
||
# sig_clf = CalibratedClassifierCV(k_cfl, method="sigmoid")
|
||
# sig_clf.fit(X_train, y_train)
|
||
|
||
# predict_y = sig_clf.predict_proba(X_train)
|
||
# print ('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(y_train, predict_y))
|
||
# predict_y = sig_clf.predict_proba(X_cv)
|
||
# print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(y_cv, predict_y))
|
||
# predict_y = sig_clf.predict_proba(X_test)
|
||
# print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(y_test, predict_y))
|
||
# plot_confusion_matrix(y_test, sig_clf.predict(X_test))
|
||
|
||
|
||
import numpy as np
|
||
from sklearn.neighbors import KNeighborsClassifier
|
||
from sklearn.calibration import CalibratedClassifierCV
|
||
from sklearn.metrics import log_loss, confusion_matrix
|
||
import matplotlib.pyplot as plt
|
||
|
||
# Tune n_neighbors for a sigmoid-calibrated k-nearest-neighbours classifier
# by cross-validation log loss, then evaluate and save the best model.
alpha = list(range(1, 15, 2))
cv_log_error_array = []

for i in alpha:
    # With its default cv setting CalibratedClassifierCV fits its own clones
    # of the estimator, so the estimator is handed over unfitted here.
    k_cfl = KNeighborsClassifier(n_neighbors=i)
    sig_clf = CalibratedClassifierCV(k_cfl, method="sigmoid")
    sig_clf.fit(X_train, y_train)
    predict_y = sig_clf.predict_proba(X_cv)
    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=sig_clf.classes_))

for k_value, cv_error in zip(alpha, cv_log_error_array):
    print('log_loss for k = ', k_value, 'is', cv_error)

best_alpha = np.argmin(cv_log_error_array)

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# Refit with the best k. k_cfl itself is fitted as well so it stays usable
# on its own after this cell.
k_cfl = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
k_cfl.fit(X_train, y_train)
sig_clf = CalibratedClassifierCV(k_cfl, method="sigmoid")
sig_clf.fit(X_train, y_train)

# labels= keeps the probability columns aligned with the classes even when a
# split does not contain every class (as already done in the tuning loop).
predict_y = sig_clf.predict_proba(X_train)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",
      log_loss(y_train, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_cv)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",
      log_loss(y_cv, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_test)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",
      log_loss(y_test, predict_y, labels=sig_clf.classes_))

# Plot confusion matrix
conf_matrix = confusion_matrix(y_test, sig_clf.predict(X_test))

plt.matshow(conf_matrix, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Create the output directory first so the dump cannot fail on a fresh checkout.
os.makedirs('models', exist_ok=True)
with open('models/KNeighborsClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)
|
||
|
||
|
||
|
||
# In[1]:
|
||
|
||
|
||
# read more about SGDClassifier() at http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
|
||
# ------------------------------
|
||
# default parameters
|
||
# SGDClassifier(loss=’hinge’, penalty=’l2’, alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
|
||
# shuffle=True, verbose=0, epsilon=0.1, n_jobs=1, random_state=None, learning_rate=’optimal’, eta0=0.0, power_t=0.5,
|
||
# class_weight=None, warm_start=False, average=False, n_iter=None)
|
||
|
||
# some of methods
|
||
# fit(X, y[, coef_init, intercept_init, …]) Fit linear model with Stochastic Gradient Descent.
|
||
# predict(X) Predict class labels for samples in X.
|
||
|
||
|
||
# alpha = [10 ** x for x in range(-5, 4)]
|
||
# cv_log_error_array=[]
|
||
# for i in alpha:
|
||
# logisticR=LogisticRegression(penalty='l2',C=i,class_weight='balanced')
|
||
# logisticR.fit(X_train,y_train)
|
||
# sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
|
||
# sig_clf.fit(X_train, y_train)
|
||
# predict_y = sig_clf.predict_proba(X_cv)
|
||
# cv_log_error_array.append(log_loss(y_cv, predict_y, labels=logisticR.classes_, eps=1e-15))
|
||
|
||
# for i in range(len(cv_log_error_array)):
|
||
# print ('log_loss for c = ',alpha[i],'is',cv_log_error_array[i])
|
||
|
||
# best_alpha = np.argmin(cv_log_error_array)
|
||
|
||
# fig, ax = plt.subplots()
|
||
# ax.plot(alpha, cv_log_error_array,c='g')
|
||
# for i, txt in enumerate(np.round(cv_log_error_array,3)):
|
||
# ax.annotate((alpha[i],np.round(txt,3)), (alpha[i],cv_log_error_array[i]))
|
||
# plt.grid()
|
||
# plt.title("Cross Validation Error for each alpha")
|
||
# plt.xlabel("Alpha i's")
|
||
# plt.ylabel("Error measure")
|
||
# plt.show()
|
||
|
||
# logisticR=LogisticRegression(penalty='l2',C=alpha[best_alpha],class_weight='balanced')
|
||
# logisticR.fit(X_train,y_train)
|
||
# sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
|
||
# sig_clf.fit(X_train, y_train)
|
||
# pred_y=sig_clf.predict(X_test)
|
||
|
||
# predict_y = sig_clf.predict_proba(X_train)
|
||
# print ('log loss for train data',log_loss(y_train, predict_y, labels=logisticR.classes_, eps=1e-15))
|
||
# predict_y = sig_clf.predict_proba(X_cv)
|
||
# print ('log loss for cv data',log_loss(y_cv, predict_y, labels=logisticR.classes_, eps=1e-15))
|
||
# predict_y = sig_clf.predict_proba(X_test)
|
||
# print ('log loss for test data',log_loss(y_test, predict_y, labels=logisticR.classes_, eps=1e-15))
|
||
# plot_confusion_matrix(y_test, sig_clf.predict(X_test))
|
||
|
||
|
||
import numpy as np
|
||
from sklearn.linear_model import LogisticRegression
|
||
from sklearn.calibration import CalibratedClassifierCV
|
||
from sklearn.metrics import log_loss, confusion_matrix
|
||
import matplotlib.pyplot as plt
|
||
|
||
# Tune the inverse regularisation strength C of a class-balanced, sigmoid-
# calibrated logistic regression by cross-validation log loss, then evaluate
# and save the best model.
alpha = [10 ** x for x in range(-5, 4)]
cv_log_error_array = []

for i in alpha:
    # With its default cv setting CalibratedClassifierCV fits its own clones
    # of the estimator, so the estimator is handed over unfitted here.
    logisticR = LogisticRegression(penalty='l2', C=i, class_weight='balanced')
    sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
    sig_clf.fit(X_train, y_train)
    predict_y = sig_clf.predict_proba(X_cv)
    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=sig_clf.classes_))

for c_value, cv_error in zip(alpha, cv_log_error_array):
    print('log_loss for c = ', c_value, 'is', cv_error)

best_alpha = np.argmin(cv_log_error_array)

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# Refit with the best C. logisticR itself is fitted as well so it stays
# usable on its own after this cell.
logisticR = LogisticRegression(penalty='l2', C=alpha[best_alpha], class_weight='balanced')
logisticR.fit(X_train, y_train)
sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
sig_clf.fit(X_train, y_train)
pred_y = sig_clf.predict(X_test)

predict_y = sig_clf.predict_proba(X_train)
print('log loss for train data', log_loss(y_train, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_cv)
print('log loss for cv data', log_loss(y_cv, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_test)
print('log loss for test data', log_loss(y_test, predict_y, labels=sig_clf.classes_))

# Plot confusion matrix (reuses the test predictions computed above).
conf_matrix = confusion_matrix(y_test, pred_y)

plt.matshow(conf_matrix, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Create the output directory first so the dump cannot fail on a fresh checkout.
os.makedirs('models', exist_ok=True)
# NOTE(review): the file is named after SGDClassifier but holds a calibrated
# LogisticRegression. The name is left unchanged because other code may load
# it by this path -- confirm before renaming.
with open('models/SGDClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)
|
||
|
||
|
||
# In[2]:
|
||
|
||
|
||
# # --------------------------------
|
||
# # default parameters
|
||
# # sklearn.ensemble.RandomForestClassifier(n_estimators=10, criterion=’gini’, max_depth=None, min_samples_split=2,
|
||
# # min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0,
|
||
# # min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False,
|
||
# # class_weight=None)
|
||
|
||
# # Some of methods of RandomForestClassifier()
|
||
# # fit(X, y, [sample_weight]) Build a forest of trees from the training set (X, y).
|
||
# # predict(X) Perform classification on samples in X.
|
||
# # predict_proba (X) Predict class probabilities for samples in X.
|
||
|
||
# # some of attributes of RandomForestClassifier()
|
||
# # feature_importances_ : array of shape = [n_features]
|
||
# # The feature importances (the higher, the more important the feature).
|
||
|
||
# alpha=[10,50,100,500,1000,2000,3000]
|
||
# cv_log_error_array=[]
|
||
# train_log_error_array=[]
|
||
# from sklearn.ensemble import RandomForestClassifier
|
||
# for i in alpha:
|
||
# r_cfl=RandomForestClassifier(n_estimators=i,random_state=42,n_jobs=-1)
|
||
# r_cfl.fit(X_train,y_train)
|
||
# sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
|
||
# sig_clf.fit(X_train, y_train)
|
||
# predict_y = sig_clf.predict_proba(X_cv)
|
||
# cv_log_error_array.append(log_loss(y_cv, predict_y, labels=r_cfl.classes_, eps=1e-15))
|
||
|
||
# for i in range(len(cv_log_error_array)):
|
||
# print ('log_loss for c = ',alpha[i],'is',cv_log_error_array[i])
|
||
|
||
|
||
# best_alpha = np.argmin(cv_log_error_array)
|
||
|
||
# fig, ax = plt.subplots()
|
||
# ax.plot(alpha, cv_log_error_array,c='g')
|
||
# for i, txt in enumerate(np.round(cv_log_error_array,3)):
|
||
# ax.annotate((alpha[i],np.round(txt,3)), (alpha[i],cv_log_error_array[i]))
|
||
# plt.grid()
|
||
# plt.title("Cross Validation Error for each alpha")
|
||
# plt.xlabel("Alpha i's")
|
||
# plt.ylabel("Error measure")
|
||
# plt.show()
|
||
|
||
|
||
# r_cfl=RandomForestClassifier(n_estimators=alpha[best_alpha],random_state=42,n_jobs=-1)
|
||
# r_cfl.fit(X_train,y_train)
|
||
# sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
|
||
# sig_clf.fit(X_train, y_train)
|
||
|
||
# predict_y = sig_clf.predict_proba(X_train)
|
||
# print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(y_train, predict_y))
|
||
# predict_y = sig_clf.predict_proba(X_cv)
|
||
# print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(y_cv, predict_y))
|
||
# predict_y = sig_clf.predict_proba(X_test)
|
||
# print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(y_test, predict_y))
|
||
# plot_confusion_matrix(y_test, sig_clf.predict(X_test))
|
||
|
||
import numpy as np
|
||
from sklearn.ensemble import RandomForestClassifier
|
||
from sklearn.calibration import CalibratedClassifierCV
|
||
from sklearn.metrics import log_loss, confusion_matrix
|
||
import matplotlib.pyplot as plt
|
||
|
||
# Random forest: choose the number of trees by cross-validation log loss,
# refit with the best value, report train / CV / test log loss, plot the test
# confusion matrix and pickle the calibrated model.

# Candidate n_estimators values (named "alpha" throughout this file).
alpha = [10, 50, 100, 500, 1000, 2000, 3000]
cv_log_error_array = []
train_log_error_array = []  # not populated in this cell; kept so the name stays defined

for n_trees in alpha:
    r_cfl = RandomForestClassifier(n_estimators=n_trees, random_state=42, n_jobs=-1)
    # r_cfl is deliberately not fitted on its own here: with the default
    # (non-"prefit") cv, CalibratedClassifierCV fits clones of the estimator
    # itself, so a separate fit would only repeat the most expensive step.
    sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
    sig_clf.fit(X_train, y_train)
    predict_y = sig_clf.predict_proba(X_cv)
    # `eps` is intentionally not passed to log_loss.
    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=sig_clf.classes_))

for n_trees, cv_error in zip(alpha, cv_log_error_array):
    print('log_loss for c = ', n_trees, 'is', cv_error)

# Index into `alpha` of the setting with the lowest CV log loss.
best_alpha = np.argmin(cv_log_error_array)

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for n_trees, cv_error in zip(alpha, cv_log_error_array):
    # Build the label from plain Python numbers so it reads "(10, 0.05)"
    # regardless of how NumPy scalars are printed.
    ax.annotate(str((n_trees, round(float(cv_error), 3))), (n_trees, cv_error))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# Refit with the selected number of trees. r_cfl is also fitted directly so
# the fitted forest itself remains available after this cell.
r_cfl = RandomForestClassifier(n_estimators=alpha[best_alpha], random_state=42, n_jobs=-1)
r_cfl.fit(X_train, y_train)
sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
sig_clf.fit(X_train, y_train)

# The full label set is passed to log_loss (as in the tuning loop) so the
# probability columns still line up when a split happens to miss a class.
predict_y = sig_clf.predict_proba(X_train)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_cv)
print('For values of best alpha = ', alpha[best_alpha], "The cross-validation log loss is:", log_loss(y_cv, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_test)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y, labels=sig_clf.classes_))

# Plot confusion matrix (rows: actual class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, sig_clf.predict(X_test))

plt.matshow(conf_matrix, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Persist the calibrated model. The directory is created first so a missing
# 'models' folder cannot discard the result of the whole training run.
os.makedirs('models', exist_ok=True)
with open('models/RandomForestClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)
|
||
|
||
|
||
# In[3]:
|
||
|
||
|
||
# # Training a hyper-parameter tuned XGBoost classifier on our train data
|
||
|
||
# # find more about XGBClassifier function here http://xgboost.readthedocs.io/en/latest/python/python_api.html?#xgboost.XGBClassifier
|
||
# # -------------------------
|
||
# # default parameters
|
||
# # class xgboost.XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True,
|
||
# # objective='binary:logistic', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1,
|
||
# # max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
|
||
# # scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, **kwargs)
|
||
|
||
# # some of the methods of XGBClassifier()
|
||
# # fit(X, y, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None)
|
||
# # get_params([deep]) Get parameters for this estimator.
|
||
# # predict(data, output_margin=False, ntree_limit=0) : Predict with data. NOTE: This function is not thread safe.
|
||
# # get_score(importance_type='weight') -> get the feature importance
|
||
|
||
|
||
# alpha=[10,50,100,500,1000,2000]
|
||
# cv_log_error_array=[]
|
||
# for i in alpha:
|
||
# x_cfl=XGBClassifier(n_estimators=i,nthread=-1)
|
||
# x_cfl.fit(X_train,y_train)
|
||
# sig_clf = CalibratedClassifierCV(x_cfl, method="sigmoid")
|
||
# sig_clf.fit(X_train, y_train)
|
||
# predict_y = sig_clf.predict_proba(X_cv)
|
||
# cv_log_error_array.append(log_loss(y_cv, predict_y, labels=x_cfl.classes_, eps=1e-15))
|
||
|
||
# for i in range(len(cv_log_error_array)):
|
||
# print ('log_loss for c = ',alpha[i],'is',cv_log_error_array[i])
|
||
# best_alpha = np.argmin(cv_log_error_array)
|
||
# fig, ax = plt.subplots()
|
||
# ax.plot(alpha, cv_log_error_array,c='g')
|
||
# for i, txt in enumerate(np.round(cv_log_error_array,3)):
|
||
# ax.annotate((alpha[i],np.round(txt,3)), (alpha[i],cv_log_error_array[i]))
|
||
# plt.grid()
|
||
# plt.title("Cross Validation Error for each alpha")
|
||
# plt.xlabel("Alpha i's")
|
||
# plt.ylabel("Error measure")
|
||
# plt.show()
|
||
# x_cfl=XGBClassifier(n_estimators=alpha[best_alpha],nthread=-1)
|
||
# x_cfl.fit(X_train,y_train)
|
||
# sig_clf = CalibratedClassifierCV(x_cfl, method="sigmoid")
|
||
# sig_clf.fit(X_train, y_train)
|
||
# predict_y = sig_clf.predict_proba(X_train)
|
||
# print ('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(y_train, predict_y))
|
||
# predict_y = sig_clf.predict_proba(X_cv)
|
||
# print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(y_cv, predict_y))
|
||
# predict_y = sig_clf.predict_proba(X_test)
|
||
# print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(y_test, predict_y))
|
||
# plot_confusion_matrix(y_test, sig_clf.predict(X_test))
|
||
|
||
|
||
|
||
import numpy as np
|
||
import matplotlib.pyplot as plt
|
||
from sklearn.calibration import CalibratedClassifierCV
|
||
from sklearn.metrics import log_loss, confusion_matrix
|
||
from xgboost import XGBClassifier
|
||
|
||
# XGBoost: choose the number of boosting rounds by cross-validation log loss,
# refit with the best value, report train / CV / test log loss, plot the test
# confusion matrix and pickle the calibrated model.

# Adjust your class labels to start from 0
# (every label is shifted down by one; presumably XGBClassifier needs 0-based
# labels - confirm against the installed xgboost version).
y_train_adjusted = y_train - 1
y_cv_adjusted = y_cv - 1
y_test_adjusted = y_test - 1

# Candidate n_estimators values (named "alpha" throughout this file).
alpha = [10, 50, 100, 500, 1000, 2000]
cv_log_error_array = []

for n_rounds in alpha:
    x_cfl = XGBClassifier(n_estimators=n_rounds, nthread=-1)
    # x_cfl is deliberately not fitted on its own here: with the default
    # (non-"prefit") cv, CalibratedClassifierCV fits clones of the estimator
    # itself, so a separate fit would only repeat the most expensive step.
    sig_clf = CalibratedClassifierCV(x_cfl, method="sigmoid")
    sig_clf.fit(X_train, y_train_adjusted)
    predict_y = sig_clf.predict_proba(X_cv)
    cv_log_error_array.append(log_loss(y_cv_adjusted, predict_y, labels=sig_clf.classes_))

for n_rounds, cv_error in zip(alpha, cv_log_error_array):
    print('log_loss for c = ', n_rounds, 'is', cv_error)

# Index into `alpha` of the setting with the lowest CV log loss.
best_alpha = np.argmin(cv_log_error_array)

fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for n_rounds, cv_error in zip(alpha, cv_log_error_array):
    # Build the label from plain Python numbers so it reads "(10, 0.05)"
    # regardless of how NumPy scalars are printed.
    ax.annotate(str((n_rounds, round(float(cv_error), 3))), (n_rounds, cv_error))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()

# Refit with the selected number of rounds. x_cfl is also fitted directly so
# the fitted booster itself remains available after this cell.
x_cfl = XGBClassifier(n_estimators=alpha[best_alpha], nthread=-1)
x_cfl.fit(X_train, y_train_adjusted)
sig_clf = CalibratedClassifierCV(x_cfl, method="sigmoid")
sig_clf.fit(X_train, y_train_adjusted)

# The full label set is passed to log_loss (as in the tuning loop) so the
# probability columns still line up when a split happens to miss a class.
predict_y = sig_clf.predict_proba(X_train)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train_adjusted, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_cv)
print('For values of best alpha = ', alpha[best_alpha], "The cross-validation log loss is:", log_loss(y_cv_adjusted, predict_y, labels=sig_clf.classes_))
predict_y = sig_clf.predict_proba(X_test)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test_adjusted, predict_y, labels=sig_clf.classes_))

# Plot confusion matrix (rows: actual class, columns: predicted class), in the
# shifted 0-based label space.
conf_matrix = confusion_matrix(y_test_adjusted, sig_clf.predict(X_test))

plt.matshow(conf_matrix, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Persist the calibrated model. It was trained on the shifted labels, so its
# predictions are one less than the original class labels. The directory is
# created first so a missing 'models' folder cannot discard the training run.
os.makedirs('models', exist_ok=True)
with open('models/XGBClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)
|
||
|
||
|
||
# In[4]:
|
||
|
||
|
||
# # https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
|
||
# x_cfl=XGBClassifier()
|
||
|
||
# prams={
|
||
# 'learning_rate':[0.01,0.03,0.05,0.1,0.15,0.2],
|
||
# 'n_estimators':[100,200,500,1000,2000],
|
||
# 'max_depth':[3,5,10],
|
||
# 'colsample_bytree':[0.1,0.3,0.5,1],
|
||
# 'subsample':[0.1,0.3,0.5,1]
|
||
# }
|
||
# random_cfl1=RandomizedSearchCV(x_cfl,param_distributions=prams,verbose=10,n_jobs=-1,)
|
||
# random_cfl1.fit(X_train,y_train)
|
||
|
||
|
||
from xgboost import XGBClassifier
|
||
from sklearn.model_selection import RandomizedSearchCV
|
||
|
||
# Randomised hyper-parameter search for XGBClassifier on the training split.

# Adjust your class labels to start from 0
# (every label is shifted down by one; the CV and test copies are not used by
# the search itself but are defined here alongside the training labels).
y_train_adjusted = y_train - 1
y_cv_adjusted = y_cv - 1
y_test_adjusted = y_test - 1

x_cfl = XGBClassifier()

# Discrete grid that RandomizedSearchCV samples candidate settings from.
params = {
    'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.15, 0.2],
    'n_estimators': [100, 200, 500, 1000, 2000],
    'max_depth': [3, 5, 10],
    'colsample_bytree': [0.1, 0.3, 0.5, 1],
    'subsample': [0.1, 0.3, 0.5, 1],
}

# random_state pins which candidates are sampled so the reported best
# parameters can be reproduced on a re-run (42 matches the seed used for the
# random forest in this file).
random_cfl1 = RandomizedSearchCV(
    x_cfl,
    param_distributions=params,
    verbose=10,
    n_jobs=-1,
    random_state=42,
)
random_cfl1.fit(X_train, y_train_adjusted)

# No `scoring` is passed, so best_score_ is measured with the estimator's
# default scorer, not with the log loss used elsewhere in this file.
print(f"Best Parameters: {random_cfl1.best_params_}")
print(f"Best Score: {random_cfl1.best_score_}")
|
||
|