Ransomeware Detection with AI
Author: David Ezeani
Import all Libraries
In [37]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LinearRegression
import numpy as np
Data Review and Clean up
In [8]:
#load dataset file
df = pd.read_csv("ransomware_data_file.csv")
In [9]:
#check dataset dimension
print("Dataset Shape:", df.shape)
Dataset Shape: (62485, 18)
In [10]:
#check datafile metadata
print("Dataset Info:")
print(df.info())
Dataset Info: <class 'pandas.core.frame.DataFrame'> RangeIndex: 62485 entries, 0 to 62484 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 FileName 62485 non-null object 1 md5Hash 62485 non-null object 2 Machine 62485 non-null int64 3 DebugSize 62485 non-null int64 4 DebugRVA 62485 non-null int64 5 MajorImageVersion 62485 non-null int64 6 MajorOSVersion 62485 non-null int64 7 ExportRVA 62485 non-null int64 8 ExportSize 62485 non-null int64 9 IatVRA 62485 non-null int64 10 MajorLinkerVersion 62485 non-null int64 11 MinorLinkerVersion 62485 non-null int64 12 NumberOfSections 62485 non-null int64 13 SizeOfStackReserve 62485 non-null int64 14 DllCharacteristics 62485 non-null int64 15 ResourceSize 62485 non-null int64 16 BitcoinAddresses 62485 non-null int64 17 Benign 62485 non-null int64 dtypes: int64(16), object(2) memory usage: 8.6+ MB None
In [11]:
#show 5 columns sample of dataset
df.sample(5)
Out[11]:
| FileName | md5Hash | Machine | DebugSize | DebugRVA | MajorImageVersion | MajorOSVersion | ExportRVA | ExportSize | IatVRA | MajorLinkerVersion | MinorLinkerVersion | NumberOfSections | SizeOfStackReserve | DllCharacteristics | ResourceSize | BitcoinAddresses | Benign | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 51895 | VirusShare_7e4483c7432a30d7df8b8b09fb692694 | 7e4483c7432a30d7df8b8b09fb692694 | 332 | 28 | 37248 | 0 | 5 | 0 | 0 | 36864 | 9 | 0 | 4 | 1048576 | 32768 | 191772 | 0 | 0 |
| 19218 | KBDTIFI2 (3).DLL | 5c2c14f2cf5ad00a8a7986fddd44c8e9 | 34404 | 84 | 11856 | 10 | 10 | 11760 | 84 | 0 | 14 | 10 | 4 | 262144 | 16736 | 1056 | 0 | 1 |
| 50115 | VirusShare_4090ce6f23e946930cfbabffbb29325e | 4090ce6f23e946930cfbabffbb29325e | 332 | 0 | 0 | 0 | 5 | 0 | 0 | 253952 | 9 | 0 | 10 | 1048576 | 0 | 5640 | 0 | 0 |
| 10309 | Microsoft.TeamFoundation.VersionControl.Common... | 04b301e4b15c186c069930c488f92b7a | 332 | 0 | 0 | 0 | 4 | 0 | 0 | 8192 | 11 | 0 | 3 | 1048576 | 34112 | 1464 | 0 | 1 |
| 44706 | VirusShare_af4c9a5042f42a05f39751ab11ecd6b0 | af4c9a5042f42a05f39751ab11ecd6b0 | 332 | 56 | 44194 | 5 | 5 | 0 | 0 | 4096 | 6 | 20 | 4 | 262144 | 34816 | 37160 | 0 | 0 |
In [12]:
# Check for duplicate values
df.duplicated().sum()
Out[12]:
np.int64(0)
In [13]:
# Check for missing values
print("\nMissing Values in the Dataset:")
print(df.isnull().sum())
Missing Values in the Dataset: FileName 0 md5Hash 0 Machine 0 DebugSize 0 DebugRVA 0 MajorImageVersion 0 MajorOSVersion 0 ExportRVA 0 ExportSize 0 IatVRA 0 MajorLinkerVersion 0 MinorLinkerVersion 0 NumberOfSections 0 SizeOfStackReserve 0 DllCharacteristics 0 ResourceSize 0 BitcoinAddresses 0 Benign 0 dtype: int64
In [14]:
df.drop(columns =["DebugRVA","MajorImageVersion","MajorOSVersion","ExportRVA","ExportSize","IatVRA","MajorLinkerVersion","MinorLinkerVersion","NumberOfSections","SizeOfStackReserve","ResourceSize","BitcoinAddresses"], inplace = True)
In [15]:
df.sample(5)
Out[15]:
| FileName | md5Hash | Machine | DebugSize | DllCharacteristics | Benign | |
|---|---|---|---|---|---|---|
| 13774 | Microsoft.AnalysisServices.Common.Wizard.resou... | 8f364da7eae118a585868dd4e90975ed | 332 | 0 | 34112 | 1 |
| 34382 | VirusShare_dc25a615a33101aa2904ec96a64ba722 | dc25a615a33101aa2904ec96a64ba722 | 332 | 0 | 0 | 0 |
| 183 | git-http-backend.exe | 9b393204a12178a5dac65e0f7e63735f | 34404 | 28 | 320 | 1 |
| 40991 | VirusShare_ee2272af249d12a0bb5e5de6cdc971f0 | ee2272af249d12a0bb5e5de6cdc971f0 | 332 | 0 | 0 | 0 |
| 9454 | Microsoft.CodeAnalysis.VisualBasic.EditorFeatu... | 8fe0cc53b9f1c49c2aab19c7d9d20618 | 332 | 56 | 34144 | 1 |
EDA - Exploratory Data Analysis
In [16]:
df['Benign'].value_counts()
Out[16]:
Benign 0 35367 1 27118 Name: count, dtype: int64
In [17]:
df['DebugSize'].value_counts()
Out[17]:
DebugSize 0 36521 28 12719 84 8974 56 4087 112 89 1 55 536 9 257 6 32 3 30 3 512 3 140 2 1056 1 276 1 17 1 4 1 135 1 10 1 1615155235 1 16 1 256 1 109 1 1236 1 102 1 49 1 1072 1 Name: count, dtype: int64
In [29]:
df.max()
Out[29]:
FileName ztrace_maps (3).dll md5Hash ffffad120581ecad433eb5a3b403b6ac Machine 43620 DebugSize 1615155235 DllCharacteristics 58632 Benign 1 dtype: object
In [76]:
plt.pie(df['Benign'].value_counts(), labels = ['Ransomware', 'Benign'], autopct = '%0.2f')
Out[76]:
([<matplotlib.patches.Wedge at 0x24514e29460>, <matplotlib.patches.Wedge at 0x24514e320f0>], [Text(-0.22647549309674733, 1.0764333936786732, 'Ransomware'), Text(0.22647545888068688, -1.076433400877538, 'Benign')], [Text(-0.12353208714368034, 0.5871454874610944, '56.60'), Text(0.12353206848037465, -0.587145491387748, '43.40')])
In [77]:
# Visualize the target variable distribution
plt.figure(figsize=(8, 6))
sns.countplot(x='Benign', data=df)
plt.title('Distribution of Ransomware vs Benign Samples')
plt.xlabel('Label (0: Ransomware, 1: Benign)')
plt.ylabel('Count')
plt.show()
In [32]:
#check relationship betweeen debug size and benign/malware state using a potion of the data
df_portion = df.sample(frac=0.5, random_state = 50)
sns.jointplot(data = df_portion, x="DebugSize", y="Benign", kind="scatter", alpha=0.3)
plt.show()
In [25]:
#check relationship betweeen dll characteristics and benign/malware state using a potion of the data
df_portion = df.sample(frac=0.5, random_state = 45)
sns.jointplot(data = df_portion, x="DllCharacteristics", y="Benign", kind="scatter")
plt.show()
In [19]:
#select numeric datatypes
numeric_df = df.select_dtypes(include=["number"])
In [20]:
# Check for correlations between numeric features
plt.figure(figsize=(12, 10))
correlation_matrix = numeric_df.corr()
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', linewidths=0.5)
plt.title('Feature Correlation Matrix')
plt.show()
In [17]:
sns.pairplot(df, hue='Benign')
plt.show()
Model Building
In [21]:
# Define features and target variable
X = df.drop('Benign', axis=1)
y = df['Benign']
In [22]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [23]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.select_dtypes(include=["number"]))
X_test_scaled = scaler.transform(X_test.select_dtypes(include=["number"]))
In [24]:
# Perform PCA to visualize data in 2D
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
plt.figure(figsize=(10, 8))
plt.scatter(X_train_pca[y_train == 0, 0], X_train_pca[y_train == 0, 1], label='Benign', alpha=0.6)
plt.scatter(X_train_pca[y_train == 1, 0], X_train_pca[y_train == 1, 1], label='Ransomware', alpha=0.6)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Ransomware Dataset')
plt.legend()
plt.show()
In [26]:
# Train a Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train)
Out[26]:
RandomForestClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=42)
In [30]:
# Make predictions on the test set
RF_y_predictions = rf_model.predict(X_test_scaled)
In [32]:
#comparing predictions of the RF Classifier with y test
sns.scatterplot(x=RF_y_predictions, y= y_test)
plt.xlabel ("Predictions")
plt.ylabel("Actual Test Values")
plt.title("Comparison of Predictions with True Values")
plt.show()
In [31]:
# Evaluate the RF classifier model
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
Confusion Matrix:
[[10446 232]
[ 191 7877]]
Classification Report:
precision recall f1-score support
0 0.98 0.98 0.98 10678
1 0.97 0.98 0.97 8068
accuracy 0.98 18746
macro avg 0.98 0.98 0.98 18746
weighted avg 0.98 0.98 0.98 18746
In [33]:
# Feature importance analysis of RF classifier
feature_importance = rf_model.feature_importances_
features = X.select_dtypes(include=["number"]).columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importance in Random Forest Classifier')
plt.show()
Model Evaluation
In [34]:
# We can then define a fuction that takes the object as an argument alongside the training and test data
def train_classifier(clf, X_train, y_train, X_test, y_test):
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division = 0)
recall = recall_score(y_test, y_pred, zero_division = 0)
f1 = f1_score(y_test, y_pred, zero_division = 0)
roc = roc_auc_score(y_test, y_pred)
return accuracy, precision, recall, f1, roc
In [38]:
# Test the training function with the model
train_classifier(rf_model, X_train.select_dtypes(include=["number"]), y_train, X_test.select_dtypes(include=["number"]), y_test)
Out[38]:
(0.9774351861730503, np.float64(0.9713898137871501), np.float64(0.9763262270699058), np.float64(0.9738517648513322), np.float64(0.9772996559586278))
In [42]:
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
roc_scores = []
current_accuracy, current_precision, current_recall, current_f1, current_roc = train_classifier(rf_model, X_train.select_dtypes(include=["number"]), y_train, X_test.select_dtypes(include=["number"]), y_test)
print('Accuracy: ', current_accuracy)
print('Precision: ', current_precision)
print('Recall: ', current_recall)
print('F1: ', current_f1)
print('ROC: ', current_roc)
print('\n')
accuracy_scores.append(current_accuracy)
precision_scores.append(current_precision)
recall_scores.append(current_recall)
f1_scores.append(current_f1)
roc_scores.append(current_roc)
Accuracy: 0.9774351861730503 Precision: 0.9713898137871501 Recall: 0.9763262270699058 F1: 0.9738517648513322 ROC: 0.9772996559586278
In [59]:
performance_df = pd.DataFrame({'Metric': ['Accuracy','Precision','Recall','F1','ROC'], 'Performance': [current_accuracy, current_precision, current_recall, current_f1, current_roc]})
In [60]:
performance_df
Out[60]:
| Metric | Performance | |
|---|---|---|
| 0 | Accuracy | 0.977435 |
| 1 | Precision | 0.971390 |
| 2 | Recall | 0.976326 |
| 3 | F1 | 0.973852 |
| 4 | ROC | 0.977300 |
In [75]:
sns.catplot(x = 'Metric', y = 'Performance', data = performance_df, kind = 'bar', palette='husl', height = 5)
plt.ylim(0.5, 1.0)
plt.xticks(rotation = 'vertical')
plt.show()
C:\Users\HP\AppData\Local\Temp\ipykernel_12324\1010139633.py:1: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.catplot(x = 'Metric', y = 'Performance', data = performance_df, kind = 'bar', palette='husl', height = 5)
In [ ]: