import pandas as pd

def _combine_model_predictions(frames):
    """Merge per-model prediction tables into a single ensemble table.

    Rows are matched by position. For every row, the class reported by the
    model with the highest 'Prediction Probability' is kept (the earliest
    model in ``frames`` wins ties), and the stored probability is the
    arithmetic mean of all models' probabilities for that row.

    Args:
        frames: Non-empty sequence of DataFrames, each providing the
            columns 'File', 'Predicted Class' and 'Prediction Probability'.

    Returns:
        A new DataFrame with the columns 'File', 'Predicted Class' and
        'Prediction Probability', indexed like ``frames[0]``.

    Raises:
        ValueError: If ``frames`` is empty, or if the tables do not list
            the same files in the same order.
    """
    if not frames:
        raise ValueError('at least one prediction table is required')

    reference = frames[0]

    # Positional matching is only meaningful when every model scored the
    # same files in the same order, so fail loudly instead of silently
    # mixing predictions that belong to different files.
    for position, frame in enumerate(frames[1:], start=2):
        if not frame['File'].equals(reference['File']):
            raise ValueError(
                f"prediction table #{position} does not list the same "
                "files, in the same order, as the first table"
            )

    # Pull the columns out once; iterating plain lists in lockstep avoids a
    # pandas label lookup for every single cell.
    prob_columns = [frame['Prediction Probability'].tolist() for frame in frames]
    class_columns = [frame['Predicted Class'].tolist() for frame in frames]

    chosen_classes = []
    mean_probs = []
    for probs, classes in zip(zip(*prob_columns), zip(*class_columns)):
        # index(max(...)) returns the first maximum, so earlier models win ties.
        chosen_classes.append(classes[probs.index(max(probs))])
        mean_probs.append(sum(probs) / len(probs))

    return pd.DataFrame({
        'File': reference['File'],
        'Predicted Class': chosen_classes,
        # Explicit dtype keeps the column float64 even for an empty table.
        'Prediction Probability': pd.Series(
            mean_probs, index=reference.index, dtype='float64'
        ),
    })


# Per-model predictions computed from the byte-level features.
df1 = pd.read_csv('latest_malware_bytes_predictions_KNeighbours.csv')
df2 = pd.read_csv('latest_malware_bytes_predictions_RandomForest.csv')
df3 = pd.read_csv('latest_malware_bytes_predictions_SGD.csv')
df4 = pd.read_csv('latest_malware_bytes_predictions_XGB.csv')

# Ensemble of the byte-level models.
combined_data1 = _combine_model_predictions([df1, df2, df3, df4])

print(combined_data1)

# Per-model predictions computed from the ASM features.
df5 = pd.read_csv('latest_malware_ASM_predictions_KNeighbours.csv')
df6 = pd.read_csv('latest_malware_ASM_predictions_LogisticRegression.csv')
df7 = pd.read_csv('latest_malware_ASM_predictions_RandomForest.csv')
df8 = pd.read_csv('latest_malware_ASM_predictions_XGB.csv')

# Ensemble of the ASM models.
combined_data2 = _combine_model_predictions([df5, df6, df7, df8])

print(combined_data2)

# Stack both ensembles into one table with a fresh 0..n-1 index.
combined_data = pd.concat([combined_data1, combined_data2], ignore_index=True)