85 lines
2.8 KiB
Python
85 lines
2.8 KiB
Python
"""Evaluate a trained ransomware classifier on a held-out test set.

Loads a saved Keras model, reads the feature/column names and the test CSV,
predicts ransomware (1) vs. goodware (0), prints evaluation metrics, and
writes per-sample predictions to a timestamped CSV file.
"""

import numpy as np  # kept: may be used by code outside this view
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from datetime import datetime

TEST_DATA_PATH = 'combined_log_summary.csv'
VARIABLE_NAMES_PATH = 'output.txt'
MODEL_PATH = 'updated_ransomware_classifier.h5'
LABEL_COLUMN = 'Label (1 Ransomware / 0 Goodware)'
FAMILY_COLUMN = 'Ransomware Family'

# Load the trained model.
model = tf.keras.models.load_model(MODEL_PATH)

# Read the feature names: one per line in the form "<index>;<name>".
# Skip blank or malformed lines instead of crashing with IndexError.
with open(VARIABLE_NAMES_PATH, encoding='utf-8') as f:
    columns = [line.split(';')[1].strip() for line in f if ';' in line]

# Load the test data and normalize the column names (strip stray whitespace).
data = pd.read_csv(TEST_DATA_PATH, header=None, names=columns)
data.columns = data.columns.str.strip()
print("Columns in DataFrame:", data.columns)

# Separate features from the label; on a column mismatch, report what *is*
# available so a bad variable-names file is easy to diagnose, then re-raise.
try:
    X = data.drop([LABEL_COLUMN, FAMILY_COLUMN], axis=1)  # Features
    y = data[LABEL_COLUMN]  # Labels
except KeyError as e:
    print(f"Error: {e}")
    print("Available columns:", data.columns)
    raise

# Standardize the features.
# NOTE(review): fitting the scaler on the *test* set leaks test statistics;
# the scaler fitted on the training data should be persisted and reused here.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Predict: the model outputs a probability per sample; threshold at 0.5.
predictions = model.predict(X)
predicted_labels = (predictions > 0.5).astype(int)
true_labels = y.values

# Human-readable "Yes"/"No" versions of both label arrays.
predicted_labels_text = ['Yes' if label == 1 else 'No' for label in predicted_labels.flatten()]
true_labels_text = ['Yes' if label == 1 else 'No' for label in true_labels]

# Timestamp used in all printed reports and in the output filename.
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# Evaluation metrics.
accuracy = accuracy_score(true_labels, predicted_labels)
conf_matrix = confusion_matrix(true_labels, predicted_labels)
class_report = classification_report(true_labels, predicted_labels)

print(f"Test Accuracy ({timestamp}): {accuracy:.2f}")
print(f"\nConfusion Matrix ({timestamp}):")
print(conf_matrix)
print(f"\nClassification Report ({timestamp}):")
print(class_report)

# Show a few sample predictions next to the ground truth.
# Bounded by the dataset size so small test sets don't raise IndexError.
print(f"\nSample Predictions vs True Labels ({timestamp}):")
for i in range(min(10, len(predicted_labels_text))):
    print(f"Sample {i}: Predicted = {predicted_labels_text[i]}, True = {true_labels_text[i]}")

# Save per-sample predictions (tagged with the run timestamp) to CSV.
output_df = pd.DataFrame({
    'Timestamp': [timestamp] * len(predicted_labels_text),
    'Predicted Label': predicted_labels_text,
    'True Label': true_labels_text,
})

output_file = f'prediction_{timestamp}.csv'
output_df.to_csv(output_file, index=False)
print(f"Predictions saved to {output_file} ({timestamp})")
|