Ransomeware Detection with AI

Author: David Ezeani

Import all Libraries

In [37]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LinearRegression 
import numpy as np

Data Review and Clean up

In [8]:
#load dataset file
df = pd.read_csv("ransomware_data_file.csv")
In [9]:
#check dataset dimension
print("Dataset Shape:", df.shape)
Dataset Shape: (62485, 18)
In [10]:
#check datafile metadata
print("Dataset Info:")
print(df.info())
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62485 entries, 0 to 62484
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   FileName            62485 non-null  object
 1   md5Hash             62485 non-null  object
 2   Machine             62485 non-null  int64 
 3   DebugSize           62485 non-null  int64 
 4   DebugRVA            62485 non-null  int64 
 5   MajorImageVersion   62485 non-null  int64 
 6   MajorOSVersion      62485 non-null  int64 
 7   ExportRVA           62485 non-null  int64 
 8   ExportSize          62485 non-null  int64 
 9   IatVRA              62485 non-null  int64 
 10  MajorLinkerVersion  62485 non-null  int64 
 11  MinorLinkerVersion  62485 non-null  int64 
 12  NumberOfSections    62485 non-null  int64 
 13  SizeOfStackReserve  62485 non-null  int64 
 14  DllCharacteristics  62485 non-null  int64 
 15  ResourceSize        62485 non-null  int64 
 16  BitcoinAddresses    62485 non-null  int64 
 17  Benign              62485 non-null  int64 
dtypes: int64(16), object(2)
memory usage: 8.6+ MB
None
In [11]:
#show 5 columns sample of dataset
df.sample(5)
Out[11]:
FileName md5Hash Machine DebugSize DebugRVA MajorImageVersion MajorOSVersion ExportRVA ExportSize IatVRA MajorLinkerVersion MinorLinkerVersion NumberOfSections SizeOfStackReserve DllCharacteristics ResourceSize BitcoinAddresses Benign
51895 VirusShare_7e4483c7432a30d7df8b8b09fb692694 7e4483c7432a30d7df8b8b09fb692694 332 28 37248 0 5 0 0 36864 9 0 4 1048576 32768 191772 0 0
19218 KBDTIFI2 (3).DLL 5c2c14f2cf5ad00a8a7986fddd44c8e9 34404 84 11856 10 10 11760 84 0 14 10 4 262144 16736 1056 0 1
50115 VirusShare_4090ce6f23e946930cfbabffbb29325e 4090ce6f23e946930cfbabffbb29325e 332 0 0 0 5 0 0 253952 9 0 10 1048576 0 5640 0 0
10309 Microsoft.TeamFoundation.VersionControl.Common... 04b301e4b15c186c069930c488f92b7a 332 0 0 0 4 0 0 8192 11 0 3 1048576 34112 1464 0 1
44706 VirusShare_af4c9a5042f42a05f39751ab11ecd6b0 af4c9a5042f42a05f39751ab11ecd6b0 332 56 44194 5 5 0 0 4096 6 20 4 262144 34816 37160 0 0
In [12]:
# Check for duplicate values

df.duplicated().sum()
Out[12]:
np.int64(0)
In [13]:
# Check for missing values
print("\nMissing Values in the Dataset:")
print(df.isnull().sum())
Missing Values in the Dataset:
FileName              0
md5Hash               0
Machine               0
DebugSize             0
DebugRVA              0
MajorImageVersion     0
MajorOSVersion        0
ExportRVA             0
ExportSize            0
IatVRA                0
MajorLinkerVersion    0
MinorLinkerVersion    0
NumberOfSections      0
SizeOfStackReserve    0
DllCharacteristics    0
ResourceSize          0
BitcoinAddresses      0
Benign                0
dtype: int64
In [14]:
df.drop(columns =["DebugRVA","MajorImageVersion","MajorOSVersion","ExportRVA","ExportSize","IatVRA","MajorLinkerVersion","MinorLinkerVersion","NumberOfSections","SizeOfStackReserve","ResourceSize","BitcoinAddresses"], inplace = True)
In [15]:
df.sample(5)
Out[15]:
FileName md5Hash Machine DebugSize DllCharacteristics Benign
13774 Microsoft.AnalysisServices.Common.Wizard.resou... 8f364da7eae118a585868dd4e90975ed 332 0 34112 1
34382 VirusShare_dc25a615a33101aa2904ec96a64ba722 dc25a615a33101aa2904ec96a64ba722 332 0 0 0
183 git-http-backend.exe 9b393204a12178a5dac65e0f7e63735f 34404 28 320 1
40991 VirusShare_ee2272af249d12a0bb5e5de6cdc971f0 ee2272af249d12a0bb5e5de6cdc971f0 332 0 0 0
9454 Microsoft.CodeAnalysis.VisualBasic.EditorFeatu... 8fe0cc53b9f1c49c2aab19c7d9d20618 332 56 34144 1

EDA - Exploratory Data Analysis

In [16]:
df['Benign'].value_counts()
Out[16]:
Benign
0    35367
1    27118
Name: count, dtype: int64
In [17]:
df['DebugSize'].value_counts()
Out[17]:
DebugSize
0             36521
28            12719
84             8974
56             4087
112              89
1                55
536               9
257               6
32                3
30                3
512               3
140               2
1056              1
276               1
17                1
4                 1
135               1
10                1
1615155235        1
16                1
256               1
109               1
1236              1
102               1
49                1
1072              1
Name: count, dtype: int64
In [29]:
df.max()
Out[29]:
FileName                           ztrace_maps (3).dll
md5Hash               ffffad120581ecad433eb5a3b403b6ac
Machine                                          43620
DebugSize                                   1615155235
DllCharacteristics                               58632
Benign                                               1
dtype: object
In [76]:
plt.pie(df['Benign'].value_counts(), labels = ['Ransomware', 'Benign'], autopct = '%0.2f')
Out[76]:
([<matplotlib.patches.Wedge at 0x24514e29460>,
  <matplotlib.patches.Wedge at 0x24514e320f0>],
 [Text(-0.22647549309674733, 1.0764333936786732, 'Ransomware'),
  Text(0.22647545888068688, -1.076433400877538, 'Benign')],
 [Text(-0.12353208714368034, 0.5871454874610944, '56.60'),
  Text(0.12353206848037465, -0.587145491387748, '43.40')])
No description has been provided for this image
In [77]:
# Visualize the target variable distribution
plt.figure(figsize=(8, 6))
sns.countplot(x='Benign', data=df)
plt.title('Distribution of Ransomware vs Benign Samples')
plt.xlabel('Label (0: Ransomware, 1: Benign)')
plt.ylabel('Count')
plt.show()
No description has been provided for this image
In [32]:
#check relationship betweeen debug size and benign/malware state using a potion of the data
df_portion = df.sample(frac=0.5, random_state = 50)
sns.jointplot(data = df_portion, x="DebugSize", y="Benign", kind="scatter", alpha=0.3)
plt.show()
No description has been provided for this image
In [25]:
#check relationship betweeen dll characteristics and benign/malware state using a potion of the data
df_portion = df.sample(frac=0.5, random_state = 45)
sns.jointplot(data = df_portion, x="DllCharacteristics", y="Benign", kind="scatter")
plt.show()
No description has been provided for this image
In [19]:
#select numeric datatypes
numeric_df = df.select_dtypes(include=["number"])
In [20]:
# Check for correlations between numeric features
plt.figure(figsize=(12, 10))
correlation_matrix = numeric_df.corr()
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', linewidths=0.5)
plt.title('Feature Correlation Matrix')
plt.show()
No description has been provided for this image
In [17]:
sns.pairplot(df, hue='Benign')
plt.show()
No description has been provided for this image

Model Building

In [21]:
# Define features and target variable
X = df.drop('Benign', axis=1)
y = df['Benign']
In [22]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [23]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.select_dtypes(include=["number"]))
X_test_scaled = scaler.transform(X_test.select_dtypes(include=["number"]))
In [24]:
# Perform PCA to visualize data in 2D
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

plt.figure(figsize=(10, 8))
plt.scatter(X_train_pca[y_train == 0, 0], X_train_pca[y_train == 0, 1], label='Benign', alpha=0.6)
plt.scatter(X_train_pca[y_train == 1, 0], X_train_pca[y_train == 1, 1], label='Ransomware', alpha=0.6)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Ransomware Dataset')
plt.legend()
plt.show()
No description has been provided for this image
In [26]:
# Train a Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train)
Out[26]:
RandomForestClassifier(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=42)
In [30]:
# Make predictions on the test set
RF_y_predictions = rf_model.predict(X_test_scaled)
In [32]:
#comparing predictions of the RF Classifier with y test
sns.scatterplot(x=RF_y_predictions, y= y_test)
plt.xlabel ("Predictions")
plt.ylabel("Actual Test Values")
plt.title("Comparison of Predictions with True Values")
plt.show()
No description has been provided for this image
In [31]:
# Evaluate the RF classifier model
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
Confusion Matrix:
[[10446   232]
 [  191  7877]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     10678
           1       0.97      0.98      0.97      8068

    accuracy                           0.98     18746
   macro avg       0.98      0.98      0.98     18746
weighted avg       0.98      0.98      0.98     18746

In [33]:
# Feature importance analysis of RF classifier
feature_importance = rf_model.feature_importances_
features = X.select_dtypes(include=["number"]).columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importance in Random Forest Classifier')
plt.show()
No description has been provided for this image

Model Evaluation

In [34]:
# We can then define a fuction that takes the object as an argument alongside the training and test data
def train_classifier(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division = 0)
    recall = recall_score(y_test, y_pred, zero_division = 0)
    f1 = f1_score(y_test, y_pred, zero_division = 0)
    roc = roc_auc_score(y_test, y_pred)

    return accuracy, precision, recall, f1, roc
In [38]:
# Test the training function with the model

train_classifier(rf_model, X_train.select_dtypes(include=["number"]), y_train, X_test.select_dtypes(include=["number"]), y_test)
Out[38]:
(0.9774351861730503,
 np.float64(0.9713898137871501),
 np.float64(0.9763262270699058),
 np.float64(0.9738517648513322),
 np.float64(0.9772996559586278))
In [42]:
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
roc_scores = []

current_accuracy, current_precision, current_recall, current_f1, current_roc = train_classifier(rf_model, X_train.select_dtypes(include=["number"]), y_train, X_test.select_dtypes(include=["number"]), y_test)

print('Accuracy: ', current_accuracy)
print('Precision: ', current_precision)
print('Recall: ', current_recall)
print('F1: ', current_f1)
print('ROC: ', current_roc)
print('\n')

accuracy_scores.append(current_accuracy)
precision_scores.append(current_precision)
recall_scores.append(current_recall)
f1_scores.append(current_f1)
roc_scores.append(current_roc)
Accuracy:  0.9774351861730503
Precision:  0.9713898137871501
Recall:  0.9763262270699058
F1:  0.9738517648513322
ROC:  0.9772996559586278


In [59]:
performance_df = pd.DataFrame({'Metric': ['Accuracy','Precision','Recall','F1','ROC'], 'Performance': [current_accuracy, current_precision, current_recall, current_f1, current_roc]})
In [60]:
performance_df
Out[60]:
Metric Performance
0 Accuracy 0.977435
1 Precision 0.971390
2 Recall 0.976326
3 F1 0.973852
4 ROC 0.977300
In [75]:
sns.catplot(x = 'Metric', y = 'Performance', data = performance_df, kind = 'bar', palette='husl', height = 5)

plt.ylim(0.5, 1.0)
plt.xticks(rotation = 'vertical')
plt.show()
C:\Users\HP\AppData\Local\Temp\ipykernel_12324\1010139633.py:1: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.catplot(x = 'Metric', y = 'Performance', data = performance_df, kind = 'bar', palette='husl', height = 5)
No description has been provided for this image
In [ ]: