Malware_Detection_Ubuntu/Bytes_Model_Generator.py

#!/usr/bin/env python
# coding: utf-8
# In[1]:
import warnings
warnings.filterwarnings("ignore")
import shutil
import os
import pandas as pd
import matplotlib
matplotlib.use('nbAgg')
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm
import pickle
from sklearn.manifold import TSNE
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_regression
import re
from nltk.util import ngrams
import scipy.sparse
import gc
import pickle as pkl
from datetime import datetime as dt
import dask.dataframe as dd
import tornado
import xgboost
import nltk
import dask
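# Pipeline overview: load trainLabels.csv, build byte-unigram counts and file sizes from
# the .bytes files in BYTES-train, min-max normalise the features, visualise them with
# t-SNE, then train, calibrate and pickle KNN, LogisticRegression, RandomForest and
# XGBoost models.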
# In[2]:
#imported all the bytes files
# root_path = '/Users/yasha/Desktop/Li-phy/Bytes/fuso63yW7wAXO20gJIBS.bytes'
# root_path = '/Users/yasha/Desktop/Li-phy/Bytes/'
# # #get_sha256_hash(root_path)
# # file_open = open(root_path,"rb")
# # print(file_open.read())
# # hex_representation = ' '.join(f'{byte:02X}' for byte in file_open.read())
# # print(hex_representation)
# # file_open.close()
# #file_open.close()
# if os.path.isdir(root_path):
# data_files = os.listdir(root_path)
# print(root_path)
# for file in data_files:
# #print(file)
# path_now=root_path+file
# file_open = open(path_now,"rb")
# #print(file_open.read())
# # hex_representation = ' '.join(f'{byte:02X}' for byte in file_open.read())
# # print(hex_representation)
# # file_open.close()
# From the ingested label file, analyse what percentage of the data each malware class covers.
#data_classification = pd.read_csv('/Users/yasha/Desktop/Li-phy/trainLabels.csv')
Y=pd.read_csv("trainLabels.csv")
total = len(Y)*1.
ax=sns.countplot(x="Class", data=Y)
for p in ax.patches:
    ax.annotate('{:.1f}%'.format(100*p.get_height()/total), (p.get_x()+0.1, p.get_height()+5))
#put 11 ticks (therefore 10 steps), from 0 to the total number of rows in the dataframe
ax.yaxis.set_ticks(np.linspace(0, total, 11))
#adjust the ticklabel to the desired format, without changing the position of the ticks.
ax.set_yticklabels(map('{:.1f}%'.format, 100*ax.yaxis.get_majorticklocs()/total))
plt.show()
# In[3]:
# For each class, calculate the size (in MB) of its .bytes files.
# Y (trainLabels.csv) has already been loaded above.
# read more about os.stat here: https://www.tutorialspoint.com/python/os_stat.htm
files = os.listdir("BYTES-train")
filenames = Y['Id'].tolist()
class_y = Y['Class'].tolist()
class_bytes = []
sizebytes = []
fnames = []
for file in files:
    file_name_no_ext = file.split('.')[0]
    file_path = os.path.join("BYTES-train", file)
    try:
        statinfo = os.stat(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        continue
    if file_name_no_ext in filenames:
        i = filenames.index(file_name_no_ext)
        print(f"File matched: {file_name_no_ext} at index {i}")
        class_bytes.append(class_y[i])
        sizebytes.append(statinfo.st_size / (1024.0 * 1024.0))  # convert size to MB
        fnames.append(file_name_no_ext)
    else:
        print(f"No match found for file: {file_name_no_ext}")
data_size_byte = pd.DataFrame({'ID': fnames, 'size': sizebytes, 'Class': class_bytes})
# In[4]:
#box plot of file size (.bytes file)
ax = sns.boxplot(x="Class", y="size", data=data_size_byte)
plt.title("boxplot of .bytes file sizes")
plt.show()
# In[5]:
# Extract unigrams from the byte files: drop the leading address token on each line of a
# .bytes file and save the remaining hex tokens to a .txt file with the same base name.
files = os.listdir("BYTES-train")
filenames = []
for file in files:
    if file.endswith("bytes"):
        file = file.split('.')[0]
        text_file = open('BYTES-train/' + file + ".txt", 'w+')
        with open('BYTES-train/' + file + ".bytes", "r") as fp:
            for line in fp:
                a = line.rstrip().split(" ")[1:]  # [1:] drops the address column
                text_file.write(' '.join(a) + "\n")
        os.remove('BYTES-train/' + file + ".bytes")
        text_file.close()
files = os.listdir('BYTES-train')
filenames2=[]
feature_matrix = np.zeros((len(files),257),dtype=int)
k=0
byte_feature_file=open('result.csv','w+')
byte_feature_file.write("ID,0,1,2,3,4,5,6,7,8,9,0a,0b,0c,0d,0e,0f,10,11,12,13,14,15,16,17,18,19,1a,1b,1c,1d,1e,1f,20,21,22,23,24,25,26,27,28,29,2a,2b,2c,2d,2e,2f,30,31,32,33,34,35,36,37,38,39,3a,3b,3c,3d,3e,3f,40,41,42,43,44,45,46,47,48,49,4a,4b,4c,4d,4e,4f,50,51,52,53,54,55,56,57,58,59,5a,5b,5c,5d,5e,5f,60,61,62,63,64,65,66,67,68,69,6a,6b,6c,6d,6e,6f,70,71,72,73,74,75,76,77,78,79,7a,7b,7c,7d,7e,7f,80,81,82,83,84,85,86,87,88,89,8a,8b,8c,8d,8e,8f,90,91,92,93,94,95,96,97,98,99,9a,9b,9c,9d,9e,9f,a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,aa,ab,ac,ad,ae,af,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,ca,cb,cc,cd,ce,cf,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,da,db,dc,dd,de,df,e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,ea,eb,ec,ed,ee,ef,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,fa,fb,fc,fd,fe,ff,??")
byte_feature_file.write("\n")
for file in files:
    filenames2.append(file)
    byte_feature_file.write(file + ",")
    if file.endswith("txt"):
        with open('BYTES-train/' + file, "r") as byte_file:
            for lines in byte_file:
                line = lines.rstrip().split(" ")
                for hex_code in line:
                    if hex_code == '??':
                        feature_matrix[k][256] += 1
                    else:
                        try:
                            feature_matrix[k][int(hex_code, 16)] += 1
                        except Exception as e:
                            print(f"Error occurred @ {file} - {e}")
    for i, row in enumerate(feature_matrix[k]):
        if i != len(feature_matrix[k]) - 1:
            byte_feature_file.write(str(row) + ",")
        else:
            byte_feature_file.write(str(row))
    byte_feature_file.write("\n")
    k += 1
byte_feature_file.close()
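# A minimal sanity-check sketch (assumption: the converted .txt files are still in
# BYTES-train): recount the unigrams of a single file with collections.Counter and
# compare against the corresponding row of result.csv.
from collections import Counter

def count_unigrams(txt_path):
    counts = Counter()
    with open(txt_path, "r") as fh:
        for line in fh:
            counts.update(line.split())  # tokens are hex pairs such as '00', 'ff', or '??'
    return counts

# Example usage (file name taken from the comments above):
# print(count_unigrams('BYTES-train/0A32eTdBKayjCWhZqDOQ.txt').most_common(5))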
# In[6]:
byte_features=pd.read_csv("result.csv")
print("Original id: ", byte_features['ID'][0])
byte_features['ID'] = byte_features['ID'].str.split('.').str[0]
print("byte_Feature: ",byte_features.head(2))
# In[7]:
print("byte_size: ", data_size_byte.head(2))
# In[8]:
byte_features_with_size = byte_features.merge(data_size_byte, on='ID')
byte_features_with_size.to_csv("result_with_size.csv")
print("Combined: ", byte_features_with_size.head(2))
# In[9]:
def normalize(df):
    result1 = df.copy()
    for feature_name in df.columns:
        if feature_name != 'ID' and feature_name != 'Class':
            max_value = df[feature_name].max()
            min_value = df[feature_name].min()
            result1[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result1

result = normalize(byte_features_with_size)
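# An equivalent sketch using scikit-learn's MinMaxScaler (assumption: it should match the
# normalize() helper above, since both apply (x - min) / (max - min) column-wise;
# result_alt is only illustrative and is not used further below).
numeric_cols = byte_features_with_size.columns.difference(['ID', 'Class'])
result_alt = byte_features_with_size.copy()
result_alt[numeric_cols] = preprocessing.MinMaxScaler().fit_transform(byte_features_with_size[numeric_cols])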
# In[10]:
result.head(2)
# In[11]:
data_y = result['Class']
result.head()
# In[12]:
xtsne=TSNE(perplexity=min(50, len(result) -1))
results=xtsne.fit_transform(result.drop(['ID','Class'], axis=1))
vis_x = results[:, 0]
vis_y = results[:, 1]
plt.scatter(vis_x, vis_y, c=data_y, cmap=plt.get_cmap("jet", 9))
plt.colorbar(ticks=range(10))
plt.clim(0.5, 9)
plt.show()
# In[13]:
#this is with perplexity 30
xtsne=TSNE(perplexity=30)
results=xtsne.fit_transform(result.drop(['ID','Class'], axis=1))
vis_x = results[:, 0]
vis_y = results[:, 1]
plt.scatter(vis_x, vis_y, c=data_y, cmap=plt.get_cmap("jet", 9))
plt.colorbar(ticks=range(10))
plt.clim(0.5, 9)
plt.show()
# In[14]:
#this is with perplexity 10
xtsne=TSNE(perplexity=10)
results=xtsne.fit_transform(result.drop(['ID','Class'], axis=1))
vis_x = results[:, 0]
vis_y = results[:, 1]
plt.scatter(vis_x, vis_y, c=data_y, cmap=plt.get_cmap("jet", 9))
plt.colorbar(ticks=range(10))
plt.clim(0.5, 9)
plt.show()
# In[15]:
data_y = result['Class']
# In[16]:
X_train, X_test, y_train, y_test = train_test_split(result.drop(['ID','Class'], axis=1), data_y,stratify=data_y,test_size=0.20)
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train,stratify=y_train,test_size=0.20)
# In[17]:
print('Number of data points in train data:', X_train.shape[0])
print('Number of data points in test data:', X_test.shape[0])
print('Number of data points in cross validation data:', X_cv.shape[0])
# In[18]:
train_class_distribution = y_train.value_counts().sort_values()
test_class_distribution = y_test.value_counts().sort_values()
cv_class_distribution = y_cv.value_counts().sort_values()
my_colors = ['r','g','b','k','y','m','c']
train_class_distribution.plot(kind='bar', color=my_colors)
plt.xlabel('Class')
plt.ylabel('Data points per Class')
plt.title('Distribution of yi in train data')
plt.grid()
plt.show()
# ref: argsort https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html
# -(train_class_distribution.values): the minus sign will give us in decreasing order
sorted_yi = np.argsort(-train_class_distribution.values)
for i in sorted_yi:
    print('Number of data points in class', train_class_distribution.index[i], ':', train_class_distribution.values[i], '(', np.round((train_class_distribution.values[i]/y_train.shape[0]*100), 3), '%)')
print('-'*80)
my_colors = ['r','g','b','k','y','m','c']
test_class_distribution.plot(kind='bar', color=my_colors)
plt.xlabel('Class')
plt.ylabel('Data points per Class')
plt.title('Distribution of yi in test data')
plt.grid()
plt.show()
# ref: argsort https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html
# -(train_class_distribution.values): the minus sign will give us in decreasing order
sorted_yi = np.argsort(-test_class_distribution.values)
for i in sorted_yi:
    print('Number of data points in class', test_class_distribution.index[i], ':', test_class_distribution.values[i], '(', np.round((test_class_distribution.values[i]/y_test.shape[0]*100), 3), '%)')
print('-'*80)
my_colors = ['r','g','b','k','y','m','c']
cv_class_distribution.plot(kind='bar', color=my_colors)
plt.xlabel('Class')
plt.ylabel('Data points per Class')
plt.title('Distribution of yi in cross validation data')
plt.grid()
plt.show()
# ref: argsort https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html
# -(train_class_distribution.values): the minus sign will give us in decreasing order
sorted_yi = np.argsort(-cv_class_distribution.values)
for i in sorted_yi:
    print('Number of data points in class', cv_class_distribution.index[i], ':', cv_class_distribution.values[i], '(', np.round((cv_class_distribution.values[i]/y_cv.shape[0]*100), 3), '%)')
# In[19]:
def plot_confusion_matrix(test_y, predict_y):
    C = confusion_matrix(test_y, predict_y)
    print("Percentage of misclassified points:", (len(test_y) - np.trace(C)) / len(test_y) * 100)
    # C is a 9x9 matrix; cell (i, j) is the number of points of class i predicted as class j
    A = ((C.T) / (C.sum(axis=1))).T
    # divide each element of the confusion matrix by the sum of elements in that row
    # C = [[1, 2],
    #      [3, 4]]
    # C.T = [[1, 3],
    #        [2, 4]]
    # C.sum(axis=1): axis=0 corresponds to columns and axis=1 to rows in a 2-D array
    # C.sum(axis=1) = [3, 7]
    # (C.T)/(C.sum(axis=1)) = [[1/3, 3/7],
    #                          [2/3, 4/7]]
    # ((C.T)/(C.sum(axis=1))).T = [[1/3, 2/3],
    #                              [3/7, 4/7]]
    # sum of row elements = 1
    B = C / C.sum(axis=0)
    # divide each element of the confusion matrix by the sum of elements in that column
    # C = [[1, 2],
    #      [3, 4]]
    # C.sum(axis=0) = [4, 6]
    # C/C.sum(axis=0) = [[1/4, 2/6],
    #                    [3/4, 4/6]]
    labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    cmap = sns.light_palette("green")
    # represent C (raw counts) in heatmap format
    print("-"*50, "Confusion matrix", "-"*50)
    plt.figure(figsize=(10, 5))
    sns.heatmap(C, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    # represent B (precision) in heatmap format
    print("-"*50, "Precision matrix", "-"*50)
    plt.figure(figsize=(10, 5))
    sns.heatmap(B, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    print("Sum of columns in precision matrix", B.sum(axis=0))
    # represent A (recall) in heatmap format
    print("-"*50, "Recall matrix", "-"*50)
    plt.figure(figsize=(10, 5))
    sns.heatmap(A, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    print("Sum of rows in recall matrix", A.sum(axis=1))
# In[21]:
# we need to generate 9 class probabilities per sample and their sum should be 1
# one solution is to generate 9 random numbers and divide each of them by their sum
# ref: https://stackoverflow.com/a/18662466/4084039
import numpy as np
from sklearn.metrics import log_loss, confusion_matrix
import matplotlib.pyplot as plt
test_data_len = X_test.shape[0]
cv_data_len = X_cv.shape[0]
# Create an output array that has the same size as the CV data
cv_predicted_y = np.zeros((cv_data_len, 9))
for i in range(cv_data_len):
    rand_probs = np.random.rand(1, 9)
    cv_predicted_y[i] = ((rand_probs / sum(sum(rand_probs)))[0])
# Compute log loss on cross-validation data
print("Log loss on Cross Validation Data using Random Model", log_loss(y_cv, cv_predicted_y))
# Test-Set error
# Create an output array that has the same size as the test data
test_predicted_y = np.zeros((test_data_len, 9))
for i in range(test_data_len):
    rand_probs = np.random.rand(1, 9)
    test_predicted_y[i] = ((rand_probs / sum(sum(rand_probs)))[0])
# Compute log loss on test data
print("Log loss on Test Data using Random Model", log_loss(y_test, test_predicted_y))
# Plot confusion matrix
predicted_y = np.argmax(test_predicted_y, axis=1)
conf_matrix = confusion_matrix(y_test, predicted_y + 1)
plt.matshow(conf_matrix, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
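# Aside: the per-row normalisation above can be expressed in a single call. A minimal
# equivalent sketch (assumption: a uniform Dirichlet draw gives probability vectors that
# already sum to 1; random_probs_alt is illustrative only and is not used below):
random_probs_alt = np.random.dirichlet(np.ones(9), size=cv_data_len)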
# In[23]:
# find more about KNeighborsClassifier() here http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
# -------------------------
# default parameter
# KNeighborsClassifier(n_neighbors=5, weights=uniform, algorithm=auto, leaf_size=30, p=2,
# metric=minkowski, metric_params=None, n_jobs=1, **kwargs)
# methods of
# fit(X, y) : Fit the model using X as training data and y as target values
# predict(X):Predict the class labels for the provided data
# predict_proba(X):Return probability estimates for the test data X.
# find more about CalibratedClassifierCV here at http://scikit-learn.org/stable/modules/generated/sklearn.calibration.CalibratedClassifierCV.html
# ----------------------------
# default parameters
# sklearn.calibration.CalibratedClassifierCV(base_estimator=None, method=sigmoid, cv=3)
#
# some of the methods of CalibratedClassifierCV()
# fit(X, y[, sample_weight]) Fit the calibrated model
# get_params([deep]) Get parameters for this estimator.
# predict(X) Predict the target of new samples.
# predict_proba(X) Posterior probabilities of classification
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss, confusion_matrix
import matplotlib.pyplot as plt
alpha = [x for x in range(1, 15, 2)]
cv_log_error_array = []
for i in alpha:
    k_cfl = KNeighborsClassifier(n_neighbors=i)
    k_cfl.fit(X_train, y_train)
    sig_clf = CalibratedClassifierCV(k_cfl, method="sigmoid")
    sig_clf.fit(X_train, y_train)
    predict_y = sig_clf.predict_proba(X_cv)
    # log_loss no longer takes an eps parameter in recent scikit-learn versions
    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=k_cfl.classes_))
for i in range(len(cv_log_error_array)):
    print('log_loss for k = ', alpha[i], 'is', cv_log_error_array[i])
best_alpha = np.argmin(cv_log_error_array)
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
k_cfl = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
k_cfl.fit(X_train, y_train)
sig_clf = CalibratedClassifierCV(k_cfl, method="sigmoid")
sig_clf.fit(X_train, y_train)
predict_y = sig_clf.predict_proba(X_train)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y))
predict_y = sig_clf.predict_proba(X_cv)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:", log_loss(y_cv, predict_y))
predict_y = sig_clf.predict_proba(X_test)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y))
# Plot confusion matrix
conf_matrix = confusion_matrix(y_test, sig_clf.predict(X_test))
plt.matshow(conf_matrix, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
os.makedirs('models', exist_ok=True)  # ensure the output directory exists
with open('models/KNeighborsClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)
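# A minimal reload sketch (assumption: any new data has the same feature columns as the
# training data). It reads back the pickle written above and re-scores two held-out rows
# as a sanity check; loaded_knn is an illustrative name only.
with open('models/KNeighborsClassifier.pkl', 'rb') as model_file:
    loaded_knn = pickle.load(model_file)
print("Reloaded KNN probabilities for two test rows:\n", loaded_knn.predict_proba(X_test[:2]))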
# In[1]:
# The block below tunes an L2-regularised LogisticRegression over C (inverse regularisation strength)
# and calibrates it with CalibratedClassifierCV.
# read more about LogisticRegression() at http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# ------------------------------
# some of the methods
# fit(X, y[, sample_weight])   Fit the model according to the given training data.
# predict(X)                   Predict class labels for samples in X.
# predict_proba(X)             Probability estimates for samples in X.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss, confusion_matrix
import matplotlib.pyplot as plt
alpha = [10 ** x for x in range(-5, 4)]
cv_log_error_array = []
for i in alpha:
    logisticR = LogisticRegression(penalty='l2', C=i, class_weight='balanced')
    logisticR.fit(X_train, y_train)
    sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
    sig_clf.fit(X_train, y_train)
    predict_y = sig_clf.predict_proba(X_cv)
    # log_loss no longer takes an eps parameter in recent scikit-learn versions
    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=logisticR.classes_))
for i in range(len(cv_log_error_array)):
    print('log_loss for C = ', alpha[i], 'is', cv_log_error_array[i])
best_alpha = np.argmin(cv_log_error_array)
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
logisticR = LogisticRegression(penalty='l2', C=alpha[best_alpha], class_weight='balanced')
logisticR.fit(X_train, y_train)
sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
sig_clf.fit(X_train, y_train)
pred_y = sig_clf.predict(X_test)
predict_y = sig_clf.predict_proba(X_train)
print('log loss for train data', log_loss(y_train, predict_y, labels=logisticR.classes_))
predict_y = sig_clf.predict_proba(X_cv)
print('log loss for cv data', log_loss(y_cv, predict_y, labels=logisticR.classes_))
predict_y = sig_clf.predict_proba(X_test)
print('log loss for test data', log_loss(y_test, predict_y, labels=logisticR.classes_))
# Plot confusion matrix
conf_matrix = confusion_matrix(y_test, sig_clf.predict(X_test))
plt.matshow(conf_matrix, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
# NOTE: despite the historical file name, this pickle holds the calibrated LogisticRegression model.
with open('models/SGDClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)
# In[2]:
# --------------------------------
# default parameters
# RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2,
#   min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None,
#   min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=1, random_state=None,
#   verbose=0, warm_start=False, class_weight=None)
# some of the methods of RandomForestClassifier()
# fit(X, y[, sample_weight])   Build a forest of trees from the training set (X, y).
# predict(X)                   Predict the class for each sample in X.
# predict_proba(X)             Predict class probabilities for each sample in X.
# some of the attributes of RandomForestClassifier()
# feature_importances_ : array of shape [n_features]
#   The feature importances (the higher, the more important the feature).
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss, confusion_matrix
import matplotlib.pyplot as plt
alpha = [10, 50, 100, 500, 1000, 2000, 3000]
cv_log_error_array = []
train_log_error_array = []
for i in alpha:
    r_cfl = RandomForestClassifier(n_estimators=i, random_state=42, n_jobs=-1)
    r_cfl.fit(X_train, y_train)
    sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
    sig_clf.fit(X_train, y_train)
    predict_y = sig_clf.predict_proba(X_cv)
    # log_loss no longer takes an eps parameter in recent scikit-learn versions
    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=r_cfl.classes_))
for i in range(len(cv_log_error_array)):
    print('log_loss for n_estimators = ', alpha[i], 'is', cv_log_error_array[i])
best_alpha = np.argmin(cv_log_error_array)
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
r_cfl = RandomForestClassifier(n_estimators=alpha[best_alpha], random_state=42, n_jobs=-1)
r_cfl.fit(X_train, y_train)
sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
sig_clf.fit(X_train, y_train)
predict_y = sig_clf.predict_proba(X_train)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train, predict_y))
predict_y = sig_clf.predict_proba(X_cv)
print('For values of best alpha = ', alpha[best_alpha], "The cross-validation log loss is:", log_loss(y_cv, predict_y))
predict_y = sig_clf.predict_proba(X_test)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test, predict_y))
# Plot confusion matrix
conf_matrix = confusion_matrix(y_test, sig_clf.predict(X_test))
plt.matshow(conf_matrix, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
with open('models/RandomForestClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)
# In[3]:
# Training a hyper-parameter tuned XGBoost classifier on our train data
# find more about XGBClassifier here: http://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier
# -------------------------
# default parameters
# xgboost.XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True,
#   objective='binary:logistic', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1,
#   max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
#   scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, **kwargs)
# some of the methods of XGBClassifier()
# fit(X, y, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None)
# get_params([deep])   Get parameters for this estimator.
# predict(data, output_margin=False, ntree_limit=0)   Predict with data (note: not thread safe).
# get_booster().get_score(importance_type='weight')   Get the feature importance.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss, confusion_matrix
from xgboost import XGBClassifier
# Adjust your class labels to start from 0
y_train_adjusted = y_train - 1
y_cv_adjusted = y_cv - 1
y_test_adjusted = y_test - 1
alpha = [10, 50, 100, 500, 1000, 2000]
cv_log_error_array = []
for i in alpha:
    x_cfl = XGBClassifier(n_estimators=i, nthread=-1)
    x_cfl.fit(X_train, y_train_adjusted)
    sig_clf = CalibratedClassifierCV(x_cfl, method="sigmoid")
    sig_clf.fit(X_train, y_train_adjusted)
    predict_y = sig_clf.predict_proba(X_cv)
    cv_log_error_array.append(log_loss(y_cv_adjusted, predict_y, labels=x_cfl.classes_))
for i in range(len(cv_log_error_array)):
    print('log_loss for n_estimators = ', alpha[i], 'is', cv_log_error_array[i])
best_alpha = np.argmin(cv_log_error_array)
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
x_cfl = XGBClassifier(n_estimators=alpha[best_alpha], nthread=-1)
x_cfl.fit(X_train, y_train_adjusted)
sig_clf = CalibratedClassifierCV(x_cfl, method="sigmoid")
sig_clf.fit(X_train, y_train_adjusted)
predict_y = sig_clf.predict_proba(X_train)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:", log_loss(y_train_adjusted, predict_y))
predict_y = sig_clf.predict_proba(X_cv)
print('For values of best alpha = ', alpha[best_alpha], "The cross-validation log loss is:", log_loss(y_cv_adjusted, predict_y))
predict_y = sig_clf.predict_proba(X_test)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:", log_loss(y_test_adjusted, predict_y))
# Plot confusion matrix
conf_matrix = confusion_matrix(y_test_adjusted, sig_clf.predict(X_test))
plt.matshow(conf_matrix, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
with open('models/XGBClassifier.pkl', 'wb') as model_file:
    pickle.dump(sig_clf, model_file)
# In[4]:
# ref: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
# Adjust your class labels to start from 0
y_train_adjusted = y_train - 1
y_cv_adjusted = y_cv - 1
y_test_adjusted = y_test - 1
x_cfl = XGBClassifier()
params = {
    'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.15, 0.2],
    'n_estimators': [100, 200, 500, 1000, 2000],
    'max_depth': [3, 5, 10],
    'colsample_bytree': [0.1, 0.3, 0.5, 1],
    'subsample': [0.1, 0.3, 0.5, 1]
}
random_cfl1 = RandomizedSearchCV(x_cfl, param_distributions=params, verbose=10, n_jobs=-1)
random_cfl1.fit(X_train, y_train_adjusted)
print(f"Best Parameters: {random_cfl1.best_params_}")
print(f"Best Score: {random_cfl1.best_score_}")